From df28f7edaf6adfef499a767ce15c852ccf66d11a Mon Sep 17 00:00:00 2001 From: Jake Walker Date: Thu, 22 Feb 2024 16:59:37 +0000 Subject: [PATCH] Add breast cancer data exploration --- PCA Clustering.ipynb | 642 ++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 641 insertions(+), 1 deletion(-) diff --git a/PCA Clustering.ipynb b/PCA Clustering.ipynb index 78e4816..3140b67 100644 --- a/PCA Clustering.ipynb +++ b/PCA Clustering.ipynb @@ -10,11 +10,651 @@ "\n", "From: " ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Breast Cancer Data Exploration" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.datasets import load_breast_cancer\n", + "\n", + "breast = load_breast_cancer()\n", + "breast_data = breast.data" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(569, 30)" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "breast_data.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(569,)" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "breast_labels = breast.target\n", + "breast_labels.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "labels = breast_labels.reshape(-1, 1)  # column vector; -1 infers the row count instead of hard-coding 569" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(569, 31)" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "final_breast_data = 
np.concatenate([breast_data, labels], axis=1)\n", + "final_breast_data.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [], + "source": [ + "breast_dataset = pd.DataFrame(final_breast_data)" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array(['mean radius', 'mean texture', 'mean perimeter', 'mean area',\n", + " 'mean smoothness', 'mean compactness', 'mean concavity',\n", + " 'mean concave points', 'mean symmetry', 'mean fractal dimension',\n", + " 'radius error', 'texture error', 'perimeter error', 'area error',\n", + " 'smoothness error', 'compactness error', 'concavity error',\n", + " 'concave points error', 'symmetry error',\n", + " 'fractal dimension error', 'worst radius', 'worst texture',\n", + " 'worst perimeter', 'worst area', 'worst smoothness',\n", + " 'worst compactness', 'worst concavity', 'worst concave points',\n", + " 'worst symmetry', 'worst fractal dimension'], dtype='\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", 
+ " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
mean radiusmean texturemean perimetermean areamean smoothnessmean compactnessmean concavitymean concave pointsmean symmetrymean fractal dimension...worst textureworst perimeterworst areaworst smoothnessworst compactnessworst concavityworst concave pointsworst symmetryworst fractal dimensionlabel
017.9910.38122.801001.00.118400.277600.30010.147100.24190.07871...17.33184.602019.00.16220.66560.71190.26540.46010.118900.0
120.5717.77132.901326.00.084740.078640.08690.070170.18120.05667...23.41158.801956.00.12380.18660.24160.18600.27500.089020.0
219.6921.25130.001203.00.109600.159900.19740.127900.20690.05999...25.53152.501709.00.14440.42450.45040.24300.36130.087580.0
311.4220.3877.58386.10.142500.283900.24140.105200.25970.09744...26.5098.87567.70.20980.86630.68690.25750.66380.173000.0
420.2914.34135.101297.00.100300.132800.19800.104300.18090.05883...16.67152.201575.00.13740.20500.40000.16250.23640.076780.0
\n", + "

5 rows × 31 columns

\n", + "" + ], + "text/plain": [ + " mean radius mean texture mean perimeter mean area mean smoothness \\\n", + "0 17.99 10.38 122.80 1001.0 0.11840 \n", + "1 20.57 17.77 132.90 1326.0 0.08474 \n", + "2 19.69 21.25 130.00 1203.0 0.10960 \n", + "3 11.42 20.38 77.58 386.1 0.14250 \n", + "4 20.29 14.34 135.10 1297.0 0.10030 \n", + "\n", + " mean compactness mean concavity mean concave points mean symmetry \\\n", + "0 0.27760 0.3001 0.14710 0.2419 \n", + "1 0.07864 0.0869 0.07017 0.1812 \n", + "2 0.15990 0.1974 0.12790 0.2069 \n", + "3 0.28390 0.2414 0.10520 0.2597 \n", + "4 0.13280 0.1980 0.10430 0.1809 \n", + "\n", + " mean fractal dimension ... worst texture worst perimeter worst area \\\n", + "0 0.07871 ... 17.33 184.60 2019.0 \n", + "1 0.05667 ... 23.41 158.80 1956.0 \n", + "2 0.05999 ... 25.53 152.50 1709.0 \n", + "3 0.09744 ... 26.50 98.87 567.7 \n", + "4 0.05883 ... 16.67 152.20 1575.0 \n", + "\n", + " worst smoothness worst compactness worst concavity worst concave points \\\n", + "0 0.1622 0.6656 0.7119 0.2654 \n", + "1 0.1238 0.1866 0.2416 0.1860 \n", + "2 0.1444 0.4245 0.4504 0.2430 \n", + "3 0.2098 0.8663 0.6869 0.2575 \n", + "4 0.1374 0.2050 0.4000 0.1625 \n", + "\n", + " worst symmetry worst fractal dimension label \n", + "0 0.4601 0.11890 0.0 \n", + "1 0.2750 0.08902 0.0 \n", + "2 0.3613 0.08758 0.0 \n", + "3 0.6638 0.17300 0.0 \n", + "4 0.2364 0.07678 0.0 \n", + "\n", + "[5 rows x 31 columns]" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "breast_dataset.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "C:\\Users\\JakeWalker\\AppData\\Local\\Temp\\ipykernel_17096\\3450579118.py:1: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.\n", + "The behavior will change in pandas 3.0. 
This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.\n", + "\n", + "For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.\n", + "\n", + "\n", + " breast_dataset[\"label\"].replace(0, \"Benign\", inplace=True)\n" + ] + } + ], + "source": [ + "# sklearn encodes target 0 as malignant and 1 as benign (see breast.target_names); assign back instead of chained inplace.\n", + "breast_dataset[\"label\"] = breast_dataset[\"label\"].replace({0: \"Malignant\", 1: \"Benign\"})" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
mean radiusmean texturemean perimetermean areamean smoothnessmean compactnessmean concavitymean concave pointsmean symmetrymean fractal dimension...worst textureworst perimeterworst areaworst smoothnessworst compactnessworst concavityworst concave pointsworst symmetryworst fractal dimensionlabel
56421.5622.39142.001479.00.111000.115900.243900.138900.17260.05623...26.40166.102027.00.141000.211300.41070.22160.20600.07115Benign
56520.1328.25131.201261.00.097800.103400.144000.097910.17520.05533...38.25155.001731.00.116600.192200.32150.16280.25720.06637Benign
56616.6028.08108.30858.10.084550.102300.092510.053020.15900.05648...34.12126.701124.00.113900.309400.34030.14180.22180.07820Benign
56720.6029.33140.101265.00.117800.277000.351400.152000.23970.07016...39.42184.601821.00.165000.868100.93870.26500.40870.12400Benign
5687.7624.5447.92181.00.052630.043620.000000.000000.15870.05884...30.3759.16268.60.089960.064440.00000.00000.28710.07039Malignant
\n", + "

5 rows × 31 columns

\n", + "
" + ], + "text/plain": [ + " mean radius mean texture mean perimeter mean area mean smoothness \\\n", + "564 21.56 22.39 142.00 1479.0 0.11100 \n", + "565 20.13 28.25 131.20 1261.0 0.09780 \n", + "566 16.60 28.08 108.30 858.1 0.08455 \n", + "567 20.60 29.33 140.10 1265.0 0.11780 \n", + "568 7.76 24.54 47.92 181.0 0.05263 \n", + "\n", + " mean compactness mean concavity mean concave points mean symmetry \\\n", + "564 0.11590 0.24390 0.13890 0.1726 \n", + "565 0.10340 0.14400 0.09791 0.1752 \n", + "566 0.10230 0.09251 0.05302 0.1590 \n", + "567 0.27700 0.35140 0.15200 0.2397 \n", + "568 0.04362 0.00000 0.00000 0.1587 \n", + "\n", + " mean fractal dimension ... worst texture worst perimeter worst area \\\n", + "564 0.05623 ... 26.40 166.10 2027.0 \n", + "565 0.05533 ... 38.25 155.00 1731.0 \n", + "566 0.05648 ... 34.12 126.70 1124.0 \n", + "567 0.07016 ... 39.42 184.60 1821.0 \n", + "568 0.05884 ... 30.37 59.16 268.6 \n", + "\n", + " worst smoothness worst compactness worst concavity \\\n", + "564 0.14100 0.21130 0.4107 \n", + "565 0.11660 0.19220 0.3215 \n", + "566 0.11390 0.30940 0.3403 \n", + "567 0.16500 0.86810 0.9387 \n", + "568 0.08996 0.06444 0.0000 \n", + "\n", + " worst concave points worst symmetry worst fractal dimension label \n", + "564 0.2216 0.2060 0.07115 Benign \n", + "565 0.1628 0.2572 0.06637 Benign \n", + "566 0.1418 0.2218 0.07820 Benign \n", + "567 0.2650 0.4087 0.12400 Benign \n", + "568 0.0000 0.2871 0.07039 Malignant \n", + "\n", + "[5 rows x 31 columns]" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "breast_dataset.tail()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## CIFAR-10 Data Exploration" + ] } ], "metadata": { + "kernelspec": { + "display_name": "data-science-research-W2__OPFf-py3.11", + "language": "python", + "name": "python3" + }, "language_info": { - "name": "python" + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + 
"file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.5" } }, "nbformat": 4,