{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "This notebook walks through a basic example of using the GPU-accelerated estimators from [RAPIDS](https://rapids.ai/) cuML and [DMLC/XGBoost](https://github.com/dmlc/xgboost) with TPOT for classification tasks. You must have access to an NVIDIA GPU and have cuML installed in your environment. Running this notebook without cuML will cause TPOT to raise a `ValueError`, indicating you should install cuML."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "from tpot import TPOTClassifier\n",
    "from sklearn.datasets import make_classification\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.metrics import accuracy_score"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "NSAMPLES = 50000\n",
    "NFEATURES = 20\n",
    "SEED = 12\n",
    "\n",
    "# For cuML with TPOT, you must use CPU data (such as NumPy arrays)\n",
    "X, y = make_classification(\n",
    "    n_samples=NSAMPLES,\n",
    "    n_features=NFEATURES,\n",
    "    n_informative=NFEATURES,\n",
    "    n_redundant=0,\n",
    "    class_sep=0.55,\n",
    "    n_classes=2,\n",
    "    random_state=SEED,\n",
    ")\n",
    "\n",
    "X = X.astype(\"float32\")\n",
    "\n",
    "X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=SEED)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Note that for cuML to work correctly, you must set `n_jobs=1` (the default setting)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "HBox(children=(FloatProgress(value=0.0, description='Optimization Progress', max=30.0, style=ProgressStyle(des…"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Generation 1 - Current best internal CV score: 0.9695733333333334\n",
      "Generation 2 - Current best internal CV score: 0.9695733333333334\n",
      "Generation 3 - Current best internal CV score: 0.9695733333333334\n",
      "Generation 4 - Current best internal CV score: 0.9705333333333334\n",
      "Generation 5 - Current best internal CV score: 0.9705333333333334\n",
      "Best pipeline: KNeighborsClassifier(input_matrix, n_neighbors=20, weights=uniform)\n",
      "0.97704\n"
     ]
    }
   ],
   "source": [
    "# TPOT setup\n",
    "GENERATIONS = 5\n",
    "POP_SIZE = 100\n",
    "CV = 5\n",
    "\n",
    "tpot = TPOTClassifier(\n",
    "    generations=GENERATIONS,\n",
    "    population_size=POP_SIZE,\n",
    "    random_state=SEED,\n",
    "    config_dict=\"TPOT cuML\",\n",
    "    n_jobs=1, # cuML requires n_jobs=1, the default\n",
    "    cv=CV,\n",
    "    verbosity=2,\n",
    ")\n",
    "\n",
    "tpot.fit(X_train, y_train)\n",
    "\n",
    "preds = tpot.predict(X_test)\n",
    "print(accuracy_score(y_test, preds))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "import numpy as np\n",
      "import pandas as pd\n",
      "from cuml.neighbors import KNeighborsClassifier\n",
      "from sklearn.model_selection import train_test_split\n",
      "\n",
      "# NOTE: Make sure that the outcome column is labeled 'target' in the data file\n",
      "tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)\n",
      "features = tpot_data.drop('target', axis=1)\n",
      "training_features, testing_features, training_target, testing_target = \\\n",
      "            train_test_split(features, tpot_data['target'], random_state=12)\n",
      "\n",
      "# Average CV score on the training set was: 0.9705333333333334\n",
      "exported_pipeline = KNeighborsClassifier(n_neighbors=20, weights=\"uniform\")\n",
      "# Fix random state in exported estimator\n",
      "if hasattr(exported_pipeline, 'random_state'):\n",
      "    setattr(exported_pipeline, 'random_state', 12)\n",
      "\n",
      "exported_pipeline.fit(training_features, training_target)\n",
      "results = exported_pipeline.predict(testing_features)\n",
      "\n"
     ]
    }
   ],
   "source": [
    "tpot.export('tpot_classification_cuml_pipeline.py')\n",
    "print(tpot.export())"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}