{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Chapter 4 - Classification" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "- [Load dataset](#Load-dataset)\n", "- [The Default data set](#Figure-4.1---Default-data-set)\n", "- [4.3 Logistic Regression](#4.3-Logistic-Regression)\n", "- [4.4 Linear Discriminant Analysis](#4.4-Linear-Discriminant-Analysis)\n", "- [Lab: 4.6.3 Linear Discriminant Analysis](#4.6.3-Linear-Discriminant-Analysis)\n", "- [Lab: 4.6.4 Quadratic Discriminant Analysis](#4.6.4-Quadratic-Discriminant-Analysis)\n", "- [Lab: 4.6.5 K-Nearest Neighbors](#4.6.5-K-Nearest-Neighbors)\n", "- [Lab: 4.6.6 An Application to Caravan Insurance Data](#4.6.6-An-Application-to-Caravan-Insurance-Data)" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", "import matplotlib as mpl\n", "import matplotlib.pyplot as plt\n", "import seaborn as sns\n", "\n", "import sklearn.linear_model as skl_lm\n", "from sklearn.discriminant_analysis import LinearDiscriminantAnalysis\n", "from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis\n", "from sklearn.metrics import confusion_matrix, classification_report, precision_score\n", "from sklearn import preprocessing\n", "from sklearn import neighbors\n", "\n", "import statsmodels.api as sm\n", "import statsmodels.formula.api as smf\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "##### Load dataset" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/Users/tz89p/Library/Python/3.9/lib/python/site-packages/openpyxl/styles/stylesheet.py:226: UserWarning: Workbook contains no default style, apply openpyxl's default\n", " warn(\"Workbook contains no default style, apply openpyxl's default\")\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Unnamed: 0defaultstudentbalanceincomedefault2student2
01NoNo729.52649544361.62507400
12NoYes817.18040712106.13470001
23NoNo1073.54916431767.13894700
\n", "
" ], "text/plain": [ " Unnamed: 0 default student balance income default2 student2\n", "0 1 No No 729.526495 44361.625074 0 0\n", "1 2 No Yes 817.180407 12106.134700 0 1\n", "2 3 No No 1073.549164 31767.138947 0 0" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Reading .xlsx files through pandas relies on the openpyxl package being installed.\n", "df = pd.read_excel('data/Default.xlsx')\n", "\n", "# Encode the two Yes/No columns as 0/1 indicators with an explicit rule\n", "# ('No' -> 0, 'Yes' -> 1) instead of factorize(), whose codes follow the order\n", "# in which the labels first appear in the file.\n", "df['default2'] = (df['default'] == 'Yes').astype(int)\n", "df['student2'] = (df['student'] == 'Yes').astype(int)\n", "df.head(3)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### 4.3 Logistic Regression" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "data": { "image/png": "", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Predictor (as a single-column matrix) and 0/1 response for the one-variable model\n", "X_train = df['balance'].to_numpy().reshape(-1, 1)\n", "y = df['default2']\n", "\n", "# Grid of balance values spanning the observed range, used to draw the fitted curve\n", "X_test = np.arange(df['balance'].min(), df['balance'].max()).reshape(-1, 1)\n", "\n", "clf = skl_lm.LogisticRegression(solver='newton-cg')\n", "clf.fit(X_train, y)\n", "# predict_proba returns one column per class: shape (n_samples, n_classes)\n", "prob = clf.predict_proba(X_test)\n", "\n", "fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))\n", "\n", "# Left plot: first-order (straight-line) regression fit to the 0/1 response\n", "sns.regplot(x='balance', y='default2', data=df, order=1, ci=None, ax=ax1,\n", "            scatter_kws={'color': 'orange'},\n", "            line_kws={'color': 'lightblue', 'lw': 2})\n", "\n", "# Right plot: observed responses with the fitted logistic probability of class 1\n", "ax2.scatter(X_train, y, color='orange')\n", "ax2.plot(X_test, prob[:, 1], color='lightblue')\n", "\n", "for ax in fig.axes:\n", "    # Dashed reference lines at 1 and 0 across the x-range of the plotted data\n", "    x_lo, x_hi = ax.xaxis.get_data_interval()\n", "    for level in (1, 0):\n", "        ax.hlines(level, xmin=x_lo, xmax=x_hi, linestyles='dashed', lw=1)\n", "    ax.set_ylabel('Probability of default')\n", "    ax.set_xlabel('Balance')\n", "    ax.set_yticks([0, 0.25, 0.5, 0.75, 1.])\n", "    ax.set_xlim(xmin=-100)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "##### scikit-learn" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "LogisticRegression(solver='newton-cg')\n", "classes: [0 1]\n", "coefficients: [[0.00549891]]\n", "intercept : [-10.6513227]\n" ] } ], "source": [ "# With the newton-cg solver the coefficients are equal/closest to the ones in the book.\n", "# I do not know the details on the differences between the solvers.\n", "X_train = df['balance'].to_numpy().reshape(-1, 1)\n", "y = df['default2']\n", "clf = skl_lm.LogisticRegression(solver='newton-cg').fit(X_train, y)\n", "\n", "# Show the estimator, its class labels and the fitted parameters\n", "print(clf)\n", "print('classes: ', clf.classes_)\n", "print('coefficients: ', clf.coef_)\n", "print('intercept :', clf.intercept_)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "##### statsmodels" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Optimization terminated successfully.\n", " Current function value: 0.079823\n", " Iterations 10\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Coef.Std.Err.zP>|z|[0.0250.975]
Intercept-10.6513310.361169-29.4912873.723665e-191-11.359208-9.943453
balance0.0054990.00022024.9524042.010855e-1370.0050670.005931
\n", "
" ], "text/plain": [ " Coef. Std.Err. z P>|z| [0.025 0.975]\n", "Intercept -10.651331 0.361169 -29.491287 3.723665e-191 -11.359208 -9.943453\n", "balance 0.005499 0.000220 24.952404 2.010855e-137 0.005067 0.005931" ] }, "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Logistic regression of default2 on balance; display only the coefficient table\n", "est = smf.logit(formula='default2 ~ balance', data=df).fit()\n", "est.summary2().tables[1]" ] }, { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Optimization terminated successfully.\n", " Current function value: 0.145434\n", " Iterations 7\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Coef.Std.Err.zP>|z|[0.0250.975]
Intercept-3.5041280.070713-49.5540940.000000-3.642723-3.365532
student20.4048870.1150193.5201770.0004310.1794540.630320
\n", "
" ], "text/plain": [ " Coef. Std.Err. z P>|z| [0.025 0.975]\n", "Intercept -3.504128 0.070713 -49.554094 0.000000 -3.642723 -3.365532\n", "student2 0.404887 0.115019 3.520177 0.000431 0.179454 0.630320" ] }, "execution_count": 29, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Logistic regression of default2 on the student indicator alone\n", "est = smf.logit(formula='default2 ~ student2', data=df).fit()\n", "est.summary2().tables[1]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "##### Multiple Logistic Regression" ] }, { "cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Optimization terminated successfully.\n", " Current function value: 0.078577\n", " Iterations 10\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Coef.Std.Err.zP>|z|[0.0250.975]
Intercept-10.8690450.492273-22.0793204.995499e-108-11.833882-9.904209
balance0.0057370.00023224.7365064.331521e-1350.0052820.006191
income0.0000030.0000080.3698087.115254e-01-0.0000130.000019
student2-0.6467760.236257-2.7375956.189022e-03-1.109831-0.183721
\n", "
" ], "text/plain": [ " Coef. Std.Err. z P>|z| [0.025 0.975]\n", "Intercept -10.869045 0.492273 -22.079320 4.995499e-108 -11.833882 -9.904209\n", "balance 0.005737 0.000232 24.736506 4.331521e-135 0.005282 0.006191\n", "income 0.000003 0.000008 0.369808 7.115254e-01 -0.000013 0.000019\n", "student2 -0.646776 0.236257 -2.737595 6.189022e-03 -1.109831 -0.183721" ] }, "execution_count": 31, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Logistic regression of default2 on all three predictors\n", "est = smf.logit(formula='default2 ~ balance + income + student2', data=df).fit()\n", "est.summary2().tables[1]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "##### Confounding" ] }, { "cell_type": "code", "execution_count": 33, "metadata": {}, "outputs": [], "source": [ "# Row masks for the two groups\n", "is_student = df['student'] == 'Yes'\n", "is_non_student = df['student'] == 'No'\n", "\n", "# Balance (single-column matrix) and default indicator for students\n", "X_train = df.loc[is_student, 'balance'].to_numpy().reshape(-1, 1)\n", "y = df.loc[is_student, 'default2']\n", "\n", "# The same pair for non-students\n", "X_train2 = df.loc[is_non_student, 'balance'].to_numpy().reshape(-1, 1)\n", "y2 = df.loc[is_non_student, 'default2']\n", "\n", "# Balance grid over the full observed range, used for plotting both curves\n", "X_test = np.arange(df['balance'].min(), df['balance'].max()).reshape(-1, 1)\n", "\n", "# One logistic model per group, each evaluated on the common grid\n", "clf = skl_lm.LogisticRegression(solver='newton-cg')\n", "clf.fit(X_train, y)\n", "prob = clf.predict_proba(X_test)\n", "\n", "clf2 = skl_lm.LogisticRegression(solver='newton-cg')\n", "clf2.fit(X_train2, y2)\n", "prob2 = clf2.predict_proba(X_test)" ] }, { "cell_type": "code", "execution_count": 34, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
defaultNoYes
student
No6850206
Yes2817127
\n", "
" ], "text/plain": [ "default No Yes\n", "student \n", "No 6850 206\n", "Yes 2817 127" ] }, "execution_count": 34, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Counts of observations by student status (rows) and default status (columns)\n", "counts = df.groupby(['student', 'default']).size()\n", "counts.unstack('default')" ] }, { "cell_type": "code", "execution_count": 37, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/Users/tz89p/Library/Python/3.9/lib/python/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead\n", " if pd.api.types.is_categorical_dtype(vector):\n", "/Users/tz89p/Library/Python/3.9/lib/python/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead\n", " if pd.api.types.is_categorical_dtype(vector):\n", "/Users/tz89p/Library/Python/3.9/lib/python/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead\n", " if pd.api.types.is_categorical_dtype(vector):\n" ] }, { "data": { "text/plain": [ "" ] }, "execution_count": 37, "metadata": {}, "output_type": "execute_result" }, { "data": { "image/png": "", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Colours keyed by student status and shared by both panels. The palette is defined\n", "# in this cell because no other cell of the notebook creates `c_palette`; without it\n", "# the boxplot call raises NameError on a fresh kernel.\n", "c_palette = {'No': 'lightblue', 'Yes': 'orange'}\n", "\n", "fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))\n", "\n", "# Left plot: fitted probability of class 1 (default) against balance, one curve per group.\n", "# `prob` comes from the model fitted on students, `prob2` from the one fitted on non-students.\n", "ax1.plot(X_test, prob[:, 1], color=c_palette['Yes'], label='Student')\n", "ax1.plot(X_test, prob2[:, 1], color=c_palette['No'], label='Non-student')\n", "\n", "ax1.set_ylabel('Default Rate')\n", "ax1.set_xlabel('Credit Card Balance')\n", "ax1.set_yticks([0, 0.2, 0.4, 0.6, 0.8, 1.])\n", "ax1.set_xlim(450, 2500)\n", "ax1.legend(loc=2)\n", "\n", "# Right plot: distribution of balance by student status\n", "sns.boxplot(x='student', y='balance', data=df, orient='v', ax=ax2, palette=c_palette)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### 4.4 Linear Discriminant Analysis\n" ] }, { "cell_type": "code", "execution_count": 40, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
True default statusNoYes
Predicted default status
No9645254
Yes2279
\n", "
" ], "text/plain": [ "True default status No Yes\n", "Predicted default status \n", "No 9645 254\n", "Yes 22 79" ] }, "execution_count": 40, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Fit LDA on the three predictors and classify the same observations it was fitted on\n", "X = df[['balance', 'income', 'student2']].to_numpy()\n", "y = df['default2'].to_numpy()\n", "\n", "lda = LinearDiscriminantAnalysis(solver='svd')\n", "lda.fit(X, y)\n", "y_pred = lda.predict(X)\n", "\n", "# Confusion matrix with the 0/1 codes shown as No/Yes.\n", "# replace() returns a new frame, so no in-place mutation is needed.\n", "df_ = pd.DataFrame({'True default status': y,\n", "                    'Predicted default status': y_pred}).replace({0: 'No', 1: 'Yes'})\n", "\n", "df_.groupby(['Predicted default status', 'True default status']).size().unstack('True default status')" ] }, { "cell_type": "code", "execution_count": 65, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " precision recall f1-score support\n", "\n", " No 0.97 1.00 0.99 9667\n", " Yes 0.78 0.24 0.36 333\n", "\n", " accuracy 0.97 10000\n", " macro avg 0.88 0.62 0.67 10000\n", "weighted avg 0.97 0.97 0.97 10000\n", "\n" ] } ], "source": [ "# Per-class precision, recall and F1 for the LDA predictions above\n", "print(classification_report(y, y_pred, target_names=['No', 'Yes']))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "\n", "Instead of using a probability of 50% as the decision boundary, we now classify an observation as 'Yes' whenever its predicted probability of default exceeds 25% (the book illustrates the same idea with a 20% threshold). " ] }, { "cell_type": "code", "execution_count": 41, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
True default statusNoYes
Predicted default status
No9645254
Yes2279
\n", "
" ], "text/plain": [ "True default status No Yes\n", "Predicted default status \n", "No 9645 254\n", "Yes 22 79" ] }, "execution_count": 41, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Confusion matrix of the current `df_` (predicted status in rows, true status in columns)\n", "pair_counts = df_.groupby(['Predicted default status', 'True default status']).size()\n", "pair_counts.unstack('True default status')" ] }, { "cell_type": "code", "execution_count": 46, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
True default statusNoYes
Predicted default status
False9518164
True149169
\n", "
" ], "text/plain": [ "True default status No Yes\n", "Predicted default status \n", "False 9518 164\n", "True 149 169" ] }, "execution_count": 46, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Lower the decision threshold, because many true 'Yes' samples are predicted as 'No'\n", "decision_prob = 0.25\n", "y_prob = lda.fit(X, y).predict_proba(X)\n", "\n", "# Build both columns directly as 'No'/'Yes' labels. The thresholded column is boolean,\n", "# and a replace() mapping keyed on the strings 'True'/'False' does not match booleans,\n", "# which leaves the table rows labelled False/True; np.where maps them explicitly.\n", "df_ = pd.DataFrame({'True default status': np.where(y == 1, 'Yes', 'No'),\n", "                    'Predicted default status': np.where(y_prob[:, 1] > decision_prob, 'Yes', 'No')})\n", "\n", "df_.groupby(['Predicted default status', 'True default status']).size().unstack('True default status')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "##### 4.6.3 Linear Discriminant Analysis" ] }, { "cell_type": "code", "execution_count": 73, "metadata": {}, "outputs": [], "source": [ "# From here on `df` holds the Smarket data (it replaces the Default data loaded earlier)\n", "df = pd.read_csv('data/Smarket.csv', usecols=range(1,10), index_col=0, parse_dates=True)" ] }, { "cell_type": "code", "execution_count": 78, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Year\n", "2005-01-01 Down\n", "2005-01-01 Down\n", "2005-01-01 Down\n", "2005-01-01 Up\n", "2005-01-01 Down\n", " ... 
\n", "2005-01-01 Up\n", "2005-01-01 Down\n", "2005-01-01 Up\n", "2005-01-01 Down\n", "2005-01-01 Down\n", "Name: Direction, Length: 252, dtype: object\n" ] } ], "source": [ "# Split by date: rows up to and including 2004 are used for fitting,\n", "# rows from 2005 onwards for validation.\n", "# Data: daily percentage returns for the S&P 500 stock index.\n", "predictors = ['Lag1', 'Lag2']\n", "\n", "X_train = df.loc[:'2004', predictors]\n", "y_train = df.loc[:'2004', 'Direction']\n", "\n", "X_test = df.loc['2005':, predictors]\n", "y_test = df.loc['2005':, 'Direction']\n", "\n", "print(y_test)\n", "\n", "# Fit on the training rows, then classify the validation rows\n", "lda = LinearDiscriminantAnalysis()\n", "lda.fit(X_train, y_train)\n", "pred = lda.predict(X_test)" ] }, { "cell_type": "code", "execution_count": 61, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([0.49198397, 0.50801603])" ] }, "execution_count": 61, "metadata": {}, "output_type": "execute_result" } ], "source": [ "lda.priors_" ] }, { "cell_type": "code", "execution_count": 62, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([[ 0.04279022, 0.03389409],\n", " [-0.03954635, -0.03132544]])" ] }, "execution_count": 62, "metadata": {}, "output_type": "execute_result" } ], "source": [ "lda.means_" ] }, { "cell_type": "code", "execution_count": 64, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " precision recall f1-score support\n", "\n", " Down 0.50 0.32 0.39 111\n", " Up 0.58 0.75 0.66 141\n", "\n", " accuracy 0.56 252\n", " macro avg 0.54 0.53 0.52 252\n", "weighted avg 0.55 0.56 0.54 252\n", "\n" ] } ], "source": [ "# Validation-set report for the LDA predictions\n", "print(classification_report(y_test, pred, digits=3))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "##### 4.6.4 Quadratic Discriminant Analysis" ] }, { "cell_type": "code", "execution_count": 72, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['Up' 'Up' 'Up' 'Up' 'Up' 'Up' 'Up' 'Up' 'Up' 'Up' 'Up' 'Down' 'Up' 'Up'\n", " 'Up' 'Up' 'Up' 'Down' 'Up' 'Up' 'Up' 'Down' 'Down' 'Up' 'Down' 'Down'\n", " 'Up' 'Up' 'Up' 'Down' 'Up' 'Up' 'Up' 'Up' 'Up' 'Up' 'Up' 'Down' 
'Down'\n", " 'Up' 'Up' 'Up' 'Up' 'Down' 'Down' 'Up' 'Up' 'Up' 'Up' 'Up' 'Up' 'Up' 'Up'\n", " 'Up' 'Up' 'Up' 'Up' 'Up' 'Up' 'Up' 'Down' 'Down' 'Up' 'Up' 'Up' 'Up' 'Up'\n", " 'Up' 'Up' 'Up' 'Up' 'Up' 'Up' 'Up' 'Down' 'Up' 'Down' 'Down' 'Up' 'Up'\n", " 'Up' 'Up' 'Up' 'Down' 'Up' 'Down' 'Down' 'Up' 'Up' 'Up' 'Up' 'Up' 'Up'\n", " 'Up' 'Down' 'Down' 'Down' 'Up' 'Up' 'Up' 'Up' 'Up' 'Up' 'Up' 'Up' 'Down'\n", " 'Up' 'Up' 'Up' 'Up' 'Up' 'Up' 'Up' 'Up' 'Up' 'Up' 'Up' 'Up' 'Up' 'Up'\n", " 'Up' 'Up' 'Up' 'Down' 'Up' 'Up' 'Up' 'Down' 'Up' 'Up' 'Down' 'Down' 'Up'\n", " 'Up' 'Up' 'Up' 'Up' 'Up' 'Down' 'Up' 'Up' 'Up' 'Up' 'Up' 'Down' 'Up' 'Up'\n", " 'Up' 'Up' 'Up' 'Up' 'Up' 'Up' 'Up' 'Up' 'Up' 'Up' 'Up' 'Up' 'Up' 'Up'\n", " 'Up' 'Up' 'Up' 'Up' 'Up' 'Up' 'Up' 'Up' 'Down' 'Up' 'Down' 'Down' 'Up'\n", " 'Up' 'Up' 'Up' 'Up' 'Up' 'Down' 'Up' 'Up' 'Up' 'Up' 'Up' 'Up' 'Up' 'Up'\n", " 'Down' 'Down' 'Up' 'Up' 'Up' 'Up' 'Up' 'Up' 'Up' 'Up' 'Up' 'Up' 'Down'\n", " 'Up' 'Down' 'Up' 'Up' 'Down' 'Down' 'Up' 'Up' 'Down' 'Down' 'Up' 'Up'\n", " 'Down' 'Up' 'Up' 'Up' 'Up' 'Down' 'Down' 'Up' 'Up' 'Up' 'Down' 'Down'\n", " 'Down' 'Down' 'Up' 'Up' 'Up' 'Up' 'Up' 'Up' 'Down' 'Up' 'Up' 'Up' 'Up'\n", " 'Up' 'Up' 'Up' 'Down' 'Up' 'Up' 'Up' 'Up' 'Up' 'Up' 'Up' 'Up' 'Up' 'Up']\n" ] } ], "source": [ "# Same train/validation split as for LDA, now with a quadratic discriminant\n", "qda = QuadraticDiscriminantAnalysis()\n", "qda.fit(X_train, y_train)\n", "pred = qda.predict(X_test)" ] }, { "cell_type": "code", "execution_count": 67, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([0.49198397, 0.50801603])" ] }, "execution_count": 67, "metadata": {}, "output_type": "execute_result" } ], "source": [ "qda.priors_" ] }, { "cell_type": "code", "execution_count": 68, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([[ 0.04279022, 0.03389409],\n", " [-0.03954635, -0.03132544]])" ] }, "execution_count": 68, "metadata": {}, "output_type": "execute_result" } ], "source": [ "qda.means_" ] }, { "cell_type": "code", "execution_count": 69, "metadata": {}, "outputs": [ { "name": 
"stdout", "output_type": "stream", "text": [ " precision recall f1-score support\n", "\n", " Down 0.600 0.270 0.373 111\n", " Up 0.599 0.858 0.706 141\n", "\n", " accuracy 0.599 252\n", " macro avg 0.600 0.564 0.539 252\n", "weighted avg 0.599 0.599 0.559 252\n", "\n" ] } ], "source": [ "# Validation-set report for the QDA predictions\n", "print(classification_report(y_test, pred, digits=3))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "##### 4.6.5 K-Nearest Neighbors" ] }, { "cell_type": "code", "execution_count": 70, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " precision recall f1-score support\n", "\n", " Down 0.426 0.387 0.406 111\n", " Up 0.550 0.589 0.568 141\n", "\n", " accuracy 0.500 252\n", " macro avg 0.488 0.488 0.487 252\n", "weighted avg 0.495 0.500 0.497 252\n", "\n" ] } ], "source": [ "# K-nearest neighbours with K = 1\n", "knn = neighbors.KNeighborsClassifier(n_neighbors=1)\n", "knn.fit(X_train, y_train)\n", "pred = knn.predict(X_test)\n", "\n", "print(classification_report(y_test, pred, digits=3))" ] }, { "cell_type": "code", "execution_count": 71, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " precision recall f1-score support\n", "\n", " Down 0.466 0.432 0.449 111\n", " Up 0.577 0.610 0.593 141\n", "\n", " accuracy 0.532 252\n", " macro avg 0.522 0.521 0.521 252\n", "weighted avg 0.528 0.532 0.529 252\n", "\n" ] } ], "source": [ "# K-nearest neighbours with K = 3\n", "knn = neighbors.KNeighborsClassifier(n_neighbors=3)\n", "knn.fit(X_train, y_train)\n", "pred = knn.predict(X_test)\n", "\n", "print(classification_report(y_test, pred, digits=3))" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " precision recall f1-score support\n", "\n", " 0 0.000 0.000 0.000 111\n", " 1 0.560 1.000 0.718 141\n", "\n", " accuracy 0.560 252\n", " macro avg 0.280 0.500 0.359 252\n", "weighted avg 0.313 0.560 0.401 252\n", "\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ 
"/Users/tz89p/Library/Python/3.9/lib/python/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n", " _warn_prf(average, modifier, msg_start, len(result))\n", "/Users/tz89p/Library/Python/3.9/lib/python/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n", " _warn_prf(average, modifier, msg_start, len(result))\n", "/Users/tz89p/Library/Python/3.9/lib/python/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n", " _warn_prf(average, modifier, msg_start, len(result))\n" ] } ], "source": [ "# Logistic regression on the Smarket data with a custom decision threshold\n", "df = pd.read_csv('data/Smarket.csv', usecols=range(1,10), index_col=0, parse_dates=True)\n", "\n", "# Encode Direction once, for the whole frame, with a fixed rule (Down -> 0, Up -> 1).\n", "# Calling factorize() separately on the training and test slices numbers the labels by\n", "# order of first appearance within each slice, so the two codings need not agree.\n", "direction = (df['Direction'] == 'Up').astype(int)\n", "\n", "X_train = df.loc[:'2004', ['Lag1', 'Lag2']].to_numpy()\n", "y_train = direction.loc[:'2004'].to_numpy()\n", "\n", "X_test = df.loc['2005':, ['Lag1', 'Lag2']].to_numpy()\n", "y_test = direction.loc['2005':].to_numpy()\n", "\n", "clf = skl_lm.LogisticRegression(solver='newton-cg')\n", "clf.fit(X_train, y_train)\n", "\n", "# Column 1 of predict_proba belongs to clf.classes_[1], i.e. code 1 ('Up')\n", "prob = clf.predict_proba(X_test)[:, 1]\n", "\n", "# Predict 'Up' when its estimated probability exceeds 40%; cast to the same 0/1 codes as y_test\n", "pred = (prob > 0.4).astype(int)\n", "\n", "# zero_division=0 keeps the reported 0.0 for a class that is never predicted, without the warning\n", "print(classification_report(y_test, pred, digits=3,\n", "                            target_names=['Down', 'Up'], zero_division=0))" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.6" } }, "nbformat": 4, "nbformat_minor": 1 }