SIGN IN SIGN UP
EpistasisLab / tpot UNCLAIMED

A Python Automated Machine Learning tool that optimizes machine learning pipelines using genetic programming.

0 0 0 Jupyter Notebook
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"This notebook walks through a basic example of using the GPU-accelerated estimators from [RAPIDS](https://rapids.ai/) cuML and [DMLC/XGBoost](https://github.com/dmlc/xgboost) with TPOT for classification tasks. You must have access to an NVIDIA GPU and have cuML installed in your environment. Running this notebook without cuML will cause TPOT to raise a `ValueError`, indicating you should install cuML."
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"from tpot import TPOTClassifier\n",
"from sklearn.datasets import make_classification\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.metrics import accuracy_score"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"# Size of the synthetic dataset, plus the seed reused by every random step\n",
"NSAMPLES = 50_000\n",
"NFEATURES = 20\n",
"SEED = 12\n",
"\n",
"# Keep the data on the CPU (e.g. NumPy arrays) when pairing cuML with TPOT.\n",
"# Two-class problem in which all NFEATURES columns are informative.\n",
"X, y = make_classification(\n",
"    n_samples=NSAMPLES,\n",
"    n_classes=2,\n",
"    n_features=NFEATURES,\n",
"    n_informative=NFEATURES,\n",
"    n_redundant=0,\n",
"    class_sep=0.55,\n",
"    random_state=SEED,\n",
")\n",
"X = X.astype(\"float32\")\n",
"\n",
"# Stratified split so both partitions keep the same class balance\n",
"X_train, X_test, y_train, y_test = train_test_split(\n",
"    X, y, random_state=SEED, stratify=y\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Note that for cuML to work correctly, you must set `n_jobs=1` (the default setting)."
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, description='Optimization Progress', max=30.0, style=ProgressStyle(des…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Generation 1 - Current best internal CV score: 0.9695733333333334\n",
"Generation 2 - Current best internal CV score: 0.9695733333333334\n",
"Generation 3 - Current best internal CV score: 0.9695733333333334\n",
"Generation 4 - Current best internal CV score: 0.9705333333333334\n",
"Generation 5 - Current best internal CV score: 0.9705333333333334\n",
"Best pipeline: KNeighborsClassifier(input_matrix, n_neighbors=20, weights=uniform)\n",
"0.97704\n"
]
}
],
"source": [
"# Evolutionary search budget and number of cross-validation folds\n",
"GENERATIONS = 5\n",
"POP_SIZE = 100\n",
"CV = 5\n",
"\n",
"# config_dict=\"TPOT cuML\" selects the GPU-accelerated search space;\n",
"# cuML requires n_jobs=1, which is also TPOT's default.\n",
"tpot = TPOTClassifier(\n",
"    config_dict=\"TPOT cuML\",\n",
"    n_jobs=1,\n",
"    generations=GENERATIONS,\n",
"    population_size=POP_SIZE,\n",
"    cv=CV,\n",
"    random_state=SEED,\n",
"    verbosity=2,\n",
")\n",
"tpot.fit(X_train, y_train)\n",
"\n",
"# Accuracy of the best pipeline on the held-out test split\n",
"preds = tpot.predict(X_test)\n",
"print(accuracy_score(y_test, preds))"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"import numpy as np\n",
"import pandas as pd\n",
"from cuml.neighbors import KNeighborsClassifier\n",
"from sklearn.model_selection import train_test_split\n",
"\n",
"# NOTE: Make sure that the outcome column is labeled 'target' in the data file\n",
"tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)\n",
"features = tpot_data.drop('target', axis=1)\n",
"training_features, testing_features, training_target, testing_target = \\\n",
" train_test_split(features, tpot_data['target'], random_state=12)\n",
"\n",
"# Average CV score on the training set was: 0.9705333333333334\n",
"exported_pipeline = KNeighborsClassifier(n_neighbors=20, weights=\"uniform\")\n",
"# Fix random state in exported estimator\n",
"if hasattr(exported_pipeline, 'random_state'):\n",
" setattr(exported_pipeline, 'random_state', 12)\n",
"\n",
"exported_pipeline.fit(training_features, training_target)\n",
"results = exported_pipeline.predict(testing_features)\n",
"\n"
]
}
],
"source": [
"# Save the best pipeline as a standalone Python script\n",
"tpot.export('tpot_classification_cuml_pipeline.py')\n",
"\n",
"# Called without a file name, export() hands back the generated code as a string\n",
"exported_code = tpot.export()\n",
"print(exported_code)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.8"
}
},
"nbformat": 4,
"nbformat_minor": 4
}