{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "This notebook walks through a basic example of using the GPU-accelerated estimators from [RAPIDS](https://rapids.ai/) cuML and [DMLC/XGBoost](https://github.com/dmlc/xgboost) with TPOT for classification tasks. You must have access to an NVIDIA GPU and have cuML installed in your environment. Running this notebook without cuML will cause TPOT to raise a `ValueError`, indicating you should install cuML."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "from tpot import TPOTClassifier\n",
    "from sklearn.datasets import make_classification\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.metrics import accuracy_score"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "NSAMPLES = 50000\n",
    "NFEATURES = 20\n",
    "SEED = 12\n",
    "\n",
    "# For cuML with TPOT, you must use CPU data (such as NumPy arrays)\n",
    "X, y = make_classification(\n",
    "    n_samples=NSAMPLES,\n",
    "    n_features=NFEATURES,\n",
    "    n_informative=NFEATURES,\n",
    "    n_redundant=0,\n",
    "    class_sep=0.55,\n",
    "    n_classes=2,\n",
    "    random_state=SEED,\n",
    ")\n",
    "\n",
    "X = X.astype(\"float32\")\n",
    "\n",
    "X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=SEED)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Note that for cuML to work correctly, you must set `n_jobs=1` (the default setting)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "HBox(children=(FloatProgress(value=0.0, description='Optimization Progress', max=30.0, style=ProgressStyle(des…"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Generation 1 - Current best internal CV score: 0.9695733333333334\n",
      "Generation 2 - Current best internal CV score: 0.9695733333333334\n",
      "Generation 3 - Current best internal CV score: 0.9695733333333334\n",
      "Generation 4 - Current best internal CV score: 0.9705333333333334\n",
      "Generation 5 - Current best internal CV score: 0.9705333333333334\n",
      "Best pipeline: KNeighborsClassifier(input_matrix, n_neighbors=20, weights=uniform)\n",
      "0.97704\n"
     ]
    }
   ],
   "source": [
    "# TPOT setup\n",
    "GENERATIONS = 5\n",
    "POP_SIZE = 100\n",
    "CV = 5\n",
    "\n",
    "tpot = TPOTClassifier(\n",
    "    generations=GENERATIONS,\n",
    "    population_size=POP_SIZE,\n",
    "    random_state=SEED,\n",
    "    config_dict=\"TPOT cuML\",\n",
    "    n_jobs=1, # cuML requires n_jobs=1, the default\n",
    "    cv=CV,\n",
    "    verbosity=2,\n",
    ")\n",
    "\n",
    "tpot.fit(X_train, y_train)\n",
    "\n",
    "preds = tpot.predict(X_test)\n",
    "print(accuracy_score(y_test, preds))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "import numpy as np\n",
      "import pandas as pd\n",
      "from cuml.neighbors import KNeighborsClassifier\n",
      "from sklearn.model_selection import train_test_split\n",
      "\n",
      "# NOTE: Make sure that the outcome column is labeled 'target' in the data file\n",
      "tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)\n",
      "features = tpot_data.drop('target', axis=1)\n",
      "training_features, testing_features, training_target, testing_target = \\\n",
      "            train_test_split(features, tpot_data['target'], random_state=12)\n",
      "\n",
      "# Average CV score on the training set was: 0.9705333333333334\n",
      "exported_pipeline = KNeighborsClassifier(n_neighbors=20, weights=\"uniform\")\n",
      "# Fix random state in exported estimator\n",
      "if hasattr(exported_pipeline, 'random_state'):\n",
      "    setattr(exported_pipeline, 'random_state', 12)\n",
      "\n",
      "exported_pipeline.fit(training_features, training_target)\n",
      "results = exported_pipeline.predict(testing_features)\n",
      "\n"
     ]
    }
   ],
   "source": [
    "tpot.export('tpot_classification_cuml_pipeline.py')\n",
    "print(tpot.export())"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}