{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# BostonHousing.csv column name explanation\n", "\n", "* CRIM - per capita crime rate by town\n", "* ZN - proportion of residential land zoned for lots over 25,000 sq.ft.\n", "* INDUS - proportion of non-retail business acres per town.\n", "* CHAS - Charles River dummy variable (1 if tract bounds river; 0 otherwise)\n", "* NOX - nitric oxides concentration (parts per 10 million)\n", "* RM - average number of rooms per dwelling\n", "* AGE - proportion of owner-occupied units built prior to 1940\n", "* DIS - weighted distances to five Boston employment centres\n", "* RAD - index of accessibility to radial highways\n", "* TAX - full-value property-tax rate per $10,000\n", "* PTRATIO - pupil-teacher ratio by town\n", "* B - 1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town\n", "* LSTAT - % lower status of the population\n", "* MEDV - Median value of owner-occupied homes in $1000's" ] },
{ "cell_type": "code", "execution_count": 5, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "dEH8W9IwdmQZ", "outputId": "ef688dd6-8d1c-4c7d-deab-2811c8211f11" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " crim zn indus chas nox rm age dis rad tax ptratio \\\n", "0 0.00632 18.0 2.31 0 0.538 6.575 65.2 4.0900 1 296 15.3 \n", "1 0.02731 0.0 7.07 0 0.469 6.421 78.9 4.9671 2 242 17.8 \n", "2 0.02729 0.0 7.07 0 0.469 7.185 61.1 4.9671 2 242 17.8 \n", "3 0.03237 0.0 2.18 0 0.458 6.998 45.8 6.0622 3 222 18.7 \n", "4 0.06905 0.0 2.18 0 0.458 7.147 54.2 6.0622 3 222 18.7 \n", "\n", " b lstat medv \n", "0 396.90 4.98 24.0 \n", "1 396.90 9.14 21.6 \n", "2 392.83 4.03 34.7 \n", "3 394.63 2.94 33.4 \n", "4 396.90 5.33 36.2 \n" ] } ], "source": [ "# Load the Boston Housing data set into `df`\n", "\n", "import pandas as pd\n", "\n", "# Relative path: resolved against the notebook's current working directory\n", "file_path = 'BostonHousing.csv'\n", "\n", "try:\n", "    df = pd.read_csv(file_path)\n", "except FileNotFoundError as err:\n", "    # Every later cell needs `df`, so stop here with a clear message instead of\n", "    # printing and letting the next cell fail with a NameError.\n", "    raise FileNotFoundError(f\"File not found at: {file_path}\") from err\n", "\n", "print(df.head())  # Display the first few rows of the DataFrame\n" ] },
{ "cell_type": "code", "execution_count": 7, "metadata": { "id": "w64J8uyRd9nM" }, "outputs": [], "source": [ "import numpy as np\n", "import pandas as pd\n", "import matplotlib.pyplot as plt\n", "from sklearn.model_selection import train_test_split\n", "import statsmodels.api as sm\n" ] },
{ "cell_type": "code", "execution_count": 13, "metadata": { "id": "56EO76-AeLY1" }, "outputs": [], "source": [ "X = df.drop(['medv'], axis=1)\n", "y = pd.Series(df.medv, name='PRICE')\n" ] },
{ "cell_type": "code", "execution_count": 14, "metadata": { "id": "m0rOKtaHgqOE" }, "outputs": [], "source": [ "# Step 1: Dummify (One-Hot Encode) the 'RAD' column\n", "# RAD: index of accessibility to radial highways (categorical)\n", "X['rad'] = X['rad'].astype(int) # Ensure it's treated as an integer categorical column\n", "X = pd.get_dummies(X, columns=['rad'], drop_first=True) # Drop first to avoid multicollinearity\n" ] },
{ "cell_type": "code", "execution_count": 15, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "nsH1CqrzhTFv", "outputId": "1a0c18b3-5055-4f3e-c40d-3ee191c531b4" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Non-numeric columns: Index(['rad_2', 'rad_3', 'rad_4', 'rad_5', 'rad_6', 'rad_7', 'rad_8',\n", " 'rad_24'],\n", " dtype='object')\n" ] } ], "source": [ "# Check for non-numeric columns\n", "non_numeric_cols = X.select_dtypes(exclude=np.number).columns\n", "print(f\"Non-numeric columns: {non_numeric_cols}\") # Print out any non-numeric columns\n" ] },
{ "cell_type": "code", "execution_count": 16, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 206 }, "id": "1avLuccfgzjL", "outputId": "2ead3029-2ce9-4c7b-823a-a367268da456" }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
crimzninduschasnoxrmagedistaxptratioblstatrad_2rad_3rad_4rad_5rad_6rad_7rad_8rad_24
00.0063218.02.3100.5386.57565.24.090029615.3396.904.98FalseFalseFalseFalseFalseFalseFalseFalse
10.027310.07.0700.4696.42178.94.967124217.8396.909.14TrueFalseFalseFalseFalseFalseFalseFalse
20.027290.07.0700.4697.18561.14.967124217.8392.834.03TrueFalseFalseFalseFalseFalseFalseFalse
30.032370.02.1800.4586.99845.86.062222218.7394.632.94FalseTrueFalseFalseFalseFalseFalseFalse
40.069050.02.1800.4587.14754.26.062222218.7396.905.33FalseTrueFalseFalseFalseFalseFalseFalse
\n", "
" ], "text/plain": [ " crim zn indus chas nox rm age dis tax ptratio \\\n", "0 0.00632 18.0 2.31 0 0.538 6.575 65.2 4.0900 296 15.3 \n", "1 0.02731 0.0 7.07 0 0.469 6.421 78.9 4.9671 242 17.8 \n", "2 0.02729 0.0 7.07 0 0.469 7.185 61.1 4.9671 242 17.8 \n", "3 0.03237 0.0 2.18 0 0.458 6.998 45.8 6.0622 222 18.7 \n", "4 0.06905 0.0 2.18 0 0.458 7.147 54.2 6.0622 222 18.7 \n", "\n", " b lstat rad_2 rad_3 rad_4 rad_5 rad_6 rad_7 rad_8 rad_24 \n", "0 396.90 4.98 False False False False False False False False \n", "1 396.90 9.14 True False False False False False False False \n", "2 392.83 4.03 True False False False False False False False \n", "3 394.63 2.94 False True False False False False False False \n", "4 396.90 5.33 False True False False False False False False " ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "X.head()" ] },
{ "cell_type": "code", "execution_count": 17, "metadata": { "id": "LPtH9jgsiI6W" }, "outputs": [], "source": [ "# Convert the boolean dummy columns of X to integers (True -> 1, False -> 0)\n", "\n", "# Select the bool columns here instead of reusing `non_numeric_cols` from the\n", "# diagnostic cell, so this cell does not depend on that cell having been run\n", "# and never attempts an int cast on a genuinely non-numeric (e.g. string) column.\n", "bool_cols = X.select_dtypes(include='bool').columns\n", "\n", "# One astype call with a per-column mapping; an empty mapping (no bool columns,\n", "# e.g. when this cell is re-run) leaves every dtype unchanged.\n", "X = X.astype({col: int for col in bool_cols})\n" ] },
{ "cell_type": "code", "execution_count": 18, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "7sFyGgUefP6Z", "outputId": "9ba50075-5b00-47c5-a545-68ec26a71edc" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Model Full:\n", "\n", " OLS Regression Results \n", "==============================================================================\n", "Dep. Variable: PRICE R-squared: 0.763\n", "Model: OLS Adj. R-squared: 0.750\n", "Method: Least Squares F-statistic: 61.59\n", "Date: Thu, 19 Sep 2024 Prob (F-statistic): 2.26e-106\n", "Time: 17:06:34 Log-Likelihood: -1184.4\n", "No. 
Observations: 404 AIC: 2411.\n", "Df Residuals: 383 BIC: 2495.\n", "Df Model: 20 \n", "Covariance Type: nonrobust \n", "==============================================================================\n", " coef std err t P>|t| [0.025 0.975]\n", "------------------------------------------------------------------------------\n", "const 28.9224 6.044 4.785 0.000 17.039 40.806\n", "crim -0.1151 0.034 -3.342 0.001 -0.183 -0.047\n", "zn 0.0391 0.016 2.395 0.017 0.007 0.071\n", "indus 0.0576 0.069 0.832 0.406 -0.079 0.194\n", "chas 2.5867 0.951 2.721 0.007 0.718 4.456\n", "nox -17.9762 4.368 -4.115 0.000 -26.565 -9.388\n", "rm 4.2873 0.467 9.176 0.000 3.369 5.206\n", "age -0.0076 0.015 -0.523 0.602 -0.036 0.021\n", "dis -1.5336 0.228 -6.721 0.000 -1.982 -1.085\n", "tax -0.0063 0.004 -1.437 0.151 -0.015 0.002\n", "ptratio -0.9041 0.158 -5.709 0.000 -1.215 -0.593\n", "b 0.0125 0.003 4.289 0.000 0.007 0.018\n", "lstat -0.5086 0.055 -9.183 0.000 -0.617 -0.400\n", "rad_2 1.0733 1.580 0.679 0.497 -2.033 4.179\n", "rad_3 4.7214 1.446 3.265 0.001 1.879 7.564\n", "rad_4 1.8003 1.307 1.377 0.169 -0.770 4.370\n", "rad_5 2.6805 1.326 2.022 0.044 0.074 5.287\n", "rad_6 0.3873 1.573 0.246 0.806 -2.706 3.481\n", "rad_7 4.3494 1.686 2.580 0.010 1.035 7.664\n", "rad_8 4.1388 1.609 2.572 0.010 0.975 7.302\n", "rad_24 5.8763 2.017 2.913 0.004 1.910 9.843\n", "==============================================================================\n", "Omnibus: 139.190 Durbin-Watson: 2.157\n", "Prob(Omnibus): 0.000 Jarque-Bera (JB): 664.889\n", "Skew: 1.416 Prob(JB): 4.18e-145\n", "Kurtosis: 8.611 Cond. No. 1.58e+04\n", "==============================================================================\n", "\n", "Notes:\n", "[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.\n", "[2] The condition number is large, 1.58e+04. 
This might indicate that there are\n", "strong multicollinearity or other numerical problems.\n" ] } ], "source": [ "# Step 2: design matrix with an explicit intercept column\n", "X_const = sm.add_constant(X)\n", "\n", "# Step 3: hold out 20% of the rows for testing (fixed seed for repeatability)\n", "X_trainf, X_testf, y_trainf, y_testf = train_test_split(\n", "    X_const, y, test_size=0.2, random_state=42\n", ")\n", "\n", "# Step 4: fit the multiple linear regression without interaction terms on the training rows\n", "model_full = sm.OLS(endog=y_trainf, exog=X_trainf).fit()\n", "print(\"Model Full:\\n\", model_full.summary(), sep=\"\\n\")" ] },
{ "cell_type": "code", "execution_count": 19, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "C8Czv7nfejsr", "outputId": "679bcdd3-c8b2-4245-880b-75b7c3cff484" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "Model with Interaction Terms:\n", "\n", " OLS Regression Results \n", "==============================================================================\n", "Dep. Variable: PRICE R-squared: 0.815\n", "Model: OLS Adj. R-squared: 0.805\n", "Method: Least Squares F-statistic: 80.34\n", "Date: Thu, 19 Sep 2024 Prob (F-statistic): 5.59e-126\n", "Time: 17:24:59 Log-Likelihood: -1133.8\n", "No. 
Observations: 404 AIC: 2312.\n", "Df Residuals: 382 BIC: 2400.\n", "Df Model: 21 \n", "Covariance Type: nonrobust \n", "========================================================================================\n", " coef std err t P>|t| [0.025 0.975]\n", "----------------------------------------------------------------------------------------\n", "const 1.9952 5.931 0.336 0.737 -9.666 13.657\n", "crim -0.1415 0.031 -4.632 0.000 -0.202 -0.081\n", "zn 0.0242 0.014 1.674 0.095 -0.004 0.053\n", "indus 0.0699 0.061 1.143 0.254 -0.050 0.190\n", "chas 2.0748 0.841 2.467 0.014 0.421 3.729\n", "nox -16.2326 3.862 -4.203 0.000 -23.827 -8.638\n", "rm 8.2557 0.561 14.705 0.000 7.152 9.360\n", "age 0.0065 0.013 0.505 0.614 -0.019 0.032\n", "dis -1.1859 0.204 -5.804 0.000 -1.588 -0.784\n", "tax -0.0088 0.004 -2.270 0.024 -0.016 -0.001\n", "ptratio -0.7147 0.141 -5.066 0.000 -0.992 -0.437\n", "b 0.0056 0.003 2.116 0.035 0.000 0.011\n", "lstat 1.7736 0.224 7.909 0.000 1.333 2.215\n", "rad_2 0.9323 1.396 0.668 0.505 -1.812 3.676\n", "rad_3 4.3100 1.278 3.373 0.001 1.797 6.823\n", "rad_4 2.2297 1.156 1.930 0.054 -0.042 4.502\n", "rad_5 2.9151 1.172 2.488 0.013 0.612 5.219\n", "rad_6 1.7916 1.396 1.283 0.200 -0.954 4.537\n", "rad_7 3.8107 1.490 2.557 0.011 0.881 6.741\n", "rad_8 2.9475 1.426 2.067 0.039 0.144 5.751\n", "rad_24 7.3393 1.788 4.105 0.000 3.824 10.854\n", "LSTAT_RM_Interaction -0.3985 0.038 -10.428 0.000 -0.474 -0.323\n", "==============================================================================\n", "Omnibus: 185.942 Durbin-Watson: 2.068\n", "Prob(Omnibus): 0.000 Jarque-Bera (JB): 2675.817\n", "Skew: 1.562 Prob(JB): 0.00\n", "Kurtosis: 15.215 Cond. No. 1.73e+04\n", "==============================================================================\n", "\n", "Notes:\n", "[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.\n", "[2] The condition number is large, 1.73e+04. 
This might indicate that there are\n", "strong multicollinearity or other numerical problems.\n" ] } ], "source": [ "# Step 5: interaction feature = lstat x rm, stored as a new column of X.\n", "# NOTE: this modifies X in place, so every cell executed after this one\n", "# (including a re-run of an earlier modelling cell) sees the extra column.\n", "X['LSTAT_RM_Interaction'] = X['lstat'].mul(X['rm'])\n", "\n", "# Intercept column for the design matrix that now includes the interaction\n", "X_with_interaction_const = sm.add_constant(X)\n", "\n", "# Step 6: 80/20 train/test split of the interaction data set (seed 42)\n", "(\n", "    X_train_interaction,\n", "    X_test_interaction,\n", "    y_train_interaction,\n", "    y_test_interaction,\n", ") = train_test_split(X_with_interaction_const, y, test_size=0.2, random_state=42)\n", "\n", "# Step 7: fit the multiple linear regression that includes the interaction term\n", "model_with_interaction = sm.OLS(endog=y_train_interaction, exog=X_train_interaction).fit()\n", "print(\"\\nModel with Interaction Terms:\\n\", model_with_interaction.summary(), sep=\"\\n\")" ] },
{ "cell_type": "markdown", "metadata": { "id": "urdNq9KlxCpF" }, "source": [ "The presence of an interaction indicates that the effect of one predictor variable on the response variable is different at different values of the other predictor variable." 
] }, { "cell_type": "markdown", "metadata": { "id": "B0vzKmXdxKvC" }, "source": [ "Reference: https://aarongullickson.github.io/stat_book/interaction-terms.html" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from statsmodels.stats.outliers_influence import variance_inflation_factor\n", "\n", "# Calculate VIF scores for every feature currently in X.\n", "#\n", "# variance_inflation_factor() regresses column i on the remaining columns of the\n", "# matrix it receives and does not add an intercept itself. Without a constant\n", "# column that auxiliary R^2 is uncentered and the VIFs come out inflated, so a\n", "# constant is added first. has_constant='add' always inserts it, and the default\n", "# prepend=True puts it at position 0.\n", "X_vif = sm.add_constant(X, has_constant='add').to_numpy(dtype=float)\n", "\n", "# Position 0 is the constant, so features occupy positions 1..k.\n", "# NOTE(review): if X contains LSTAT_RM_Interaction (the product of lstat and rm),\n", "# those three columns will likely still show elevated VIFs - confirm on re-run.\n", "vif_data = pd.DataFrame({\n", "    \"feature\": X.columns,\n", "    \"VIF\": [variance_inflation_factor(X_vif, i) for i in range(1, X_vif.shape[1])],\n", "})\n", "\n", "print(\"VIF Scores before removing high VIF features:\")\n", "print(vif_data)" ] } ], "metadata": { "colab": { "provenance": [] }, "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.5" } }, "nbformat": 4, "nbformat_minor": 4 }