{ "cells": [ { "cell_type": "code", "execution_count": 2, "metadata": { "id": "DSOtyxR9W07s" }, "outputs": [], "source": [ "import numpy as np\n", "import pandas as pd\n", "\n", "from statsmodels.stats.outliers_influence import variance_inflation_factor\n", "\n", "\n" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 622 }, "id": "fQEohnQ6AGpq", "outputId": "3f1c34a0-7b76-4541-f948-f759feac15a5" }, "outputs": [ { "data": { "image/png": "", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Import necessary functions\n", "import seaborn as sns\n", "import matplotlib.pyplot as plt\n", "\n", "# Sample dataset\n", "data = pd.DataFrame({\n", " 'House Size (sq.ft)': [2000, 2500, 1800, 2200, 2400, 2100, 2300],\n", " 'Number of Bedrooms': [3, 4, 3, 4, 5, 3, 4],\n", " 'House Age (yrs)': [5, 3, 8, 2, 4, 6, 7],\n", " 'House Price ($)': [300000, 400000, 280000, 350000, 380000, 320000, 360000]\n", "})\n", "\n", "# Calculate and visualize correlations\n", "correlation_matrix = data.corr()\n", "plt.figure(figsize=(10, 7))\n", "sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm')\n", "plt.title(\"Correlation Matrix\")\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "0ieAcmDZ2Ifh", "outputId": "699bc058-5df6-4b39-9e78-437da5586681" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "VIF Scores before removing high VIF features:\n", " feature VIF\n", "0 House Size (sq.ft) 92.304602\n", "1 Number of Bedrooms 75.623766\n", "2 House Age (yrs) 5.896487\n", "Initial Model Summary:\n", " OLS Regression Results \n", "==============================================================================\n", "Dep. Variable: House Price ($) R-squared: 0.986\n", "Model: OLS Adj. R-squared: 0.972\n", "Method: Least Squares F-statistic: 71.09\n", "Date: Thu, 19 Sep 2024 Prob (F-statistic): 0.00276\n", "Time: 16:34:04 Log-Likelihood: -69.163\n", "No. Observations: 7 AIC: 146.3\n", "Df Residuals: 3 BIC: 146.1\n", "Df Model: 3 \n", "Covariance Type: nonrobust \n", "======================================================================================\n", " coef std err t P>|t| [0.025 0.975]\n", "--------------------------------------------------------------------------------------\n", "const -2.854e+04 4.08e+04 -0.700 0.535 -1.58e+05 1.01e+05\n", "House Size (sq.ft) 160.5293 21.984 7.302 0.005 90.566 230.492\n", "Number of Bedrooms 5888.9303 6470.159 0.910 0.430 -1.47e+04 2.65e+04\n", "House Age (yrs) -555.3485 1722.179 -0.322 0.768 -6036.090 4925.393\n", "==============================================================================\n", "Omnibus: nan Durbin-Watson: 1.470\n", "Prob(Omnibus): nan Jarque-Bera (JB): 0.523\n", "Skew: -0.080 Prob(JB): 0.770\n", "Kurtosis: 1.670 Cond. No. 3.29e+04\n", "==============================================================================\n", "\n", "Notes:\n", "[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.\n", "[2] The condition number is large, 3.29e+04. This might indicate that there are\n", "strong multicollinearity or other numerical problems.\n", "\n", "VIF Scores after removing high VIF feature:\n", " feature VIF\n", "0 House Age (yrs) 75.623766\n", "\n", "Reduced Model Summary:\n", " OLS Regression Results \n", "==============================================================================\n", "Dep. Variable: House Price ($) R-squared: 0.388\n", "Model: OLS Adj. R-squared: 0.265\n", "Method: Least Squares F-statistic: 3.165\n", "Date: Thu, 19 Sep 2024 Prob (F-statistic): 0.135\n", "Time: 16:34:04 Log-Likelihood: -82.419\n", "No. Observations: 7 AIC: 168.8\n", "Df Residuals: 5 BIC: 168.7\n", "Df Model: 1 \n", "Covariance Type: nonrobust \n", "===================================================================================\n", " coef std err t P>|t| [0.025 0.975]\n", "-----------------------------------------------------------------------------------\n", "const 4.039e+05 3.78e+04 10.676 0.000 3.07e+05 5.01e+05\n", "House Age (yrs) -1.25e+04 7025.827 -1.779 0.135 -3.06e+04 5560.463\n", "==============================================================================\n", "Omnibus: nan Durbin-Watson: 2.310\n", "Prob(Omnibus): nan Jarque-Bera (JB): 0.799\n", "Skew: 0.131 Prob(JB): 0.671\n", "Kurtosis: 1.366 Cond. No. 14.9\n", "==============================================================================\n", "\n", "Notes:\n", "[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.\n", "\n", "Initial Model R-squared: 0.9861292620673061, Adjusted R-squared: 0.9722585241346122\n", "Reduced Model R-squared: 0.38765822784810133, Adjusted R-squared: 0.2651898734177216\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "/opt/homebrew/Caskroom/miniconda/base/envs/accelai/lib/python3.12/site-packages/statsmodels/stats/stattools.py:74: ValueWarning: omni_normtest is not valid with less than 8 observations; 7 samples were given.\n", " warn(\"omni_normtest is not valid with less than 8 observations; %i \"\n", "/opt/homebrew/Caskroom/miniconda/base/envs/accelai/lib/python3.12/site-packages/statsmodels/stats/stattools.py:74: ValueWarning: omni_normtest is not valid with less than 8 observations; 7 samples were given.\n", " warn(\"omni_normtest is not valid with less than 8 observations; %i \"\n" ] } ], "source": [ "import statsmodels.api as sm\n", "\n", "# Split into features (X) and target (y)\n", "X = data.drop('House Price ($)', axis=1)\n", "y = data['House Price ($)']\n", "\n", "# Add a constant to the features\n", "X_with_const = sm.add_constant(X)\n", "\n", "# Calculate VIF Scores\n", "vif_data = pd.DataFrame()\n", "vif_data[\"feature\"] = X.columns\n", "#vif_data[\"VIF\"] = [variance_inflation_factor(X.values, i+1) for i in range(X.shape[1])]\n", "vif_data[\"VIF\"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])] # The index should start from 0 for the first column\n", "\n", "print(\"VIF Scores before removing high VIF features:\")\n", "print(vif_data)\n", "\n", "# Initial Linear Regression Model using OLS\n", "model = sm.OLS(y, X_with_const).fit()\n", "\n", "# Model Summary\n", "print(\"Initial Model Summary:\")\n", "print(model.summary())\n", "\n", "# Remove features with high VIF (VIF > 5 is usually considered high)\n", "# Let's assume 'Number of Bedrooms' had a high VIF\n", "X_reduced = X.drop(columns=['Number of Bedrooms','House Size (sq.ft)'])\n", "X_reduced_with_const = sm.add_constant(X_reduced)\n", "\n", "# Recalculate VIF after removing the feature\n", "vif_data_reduced = pd.DataFrame()\n", "vif_data_reduced[\"feature\"] = X_reduced.columns\n", "vif_data_reduced[\"VIF\"] = [variance_inflation_factor(X, i+1) for i in range(X_reduced.shape[1])]\n", "\n", "print(\"\\nVIF Scores after removing high VIF feature:\")\n", "print(vif_data_reduced)\n", "\n", "# Build the reduced model\n", "model_reduced = sm.OLS(y, X_reduced_with_const).fit()\n", "\n", "# Model Summary for reduced model\n", "print(\"\\nReduced Model Summary:\")\n", "print(model_reduced.summary())\n", "\n", "# Compare performance of both models using R-squared and adjusted R-squared\n", "print(f\"\\nInitial Model R-squared: {model.rsquared}, Adjusted R-squared: {model.rsquared_adj}\")\n", "print(f\"Reduced Model R-squared: {model_reduced.rsquared}, Adjusted R-squared: {model_reduced.rsquared_adj}\")" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "id": "A1bFHBK4_HHf" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "VIF Scores after removing high VIF feature:\n", " feature VIF\n", "0 House Size (sq.ft) 75.623766\n", "1 House Age (yrs) 5.896487\n", "\n", "Reduced Model Summary:\n", " OLS Regression Results \n", "==============================================================================\n", "Dep. Variable: House Price ($) R-squared: 0.982\n", "Model: OLS Adj. R-squared: 0.973\n", "Method: Least Squares F-statistic: 111.0\n", "Date: Thu, 19 Sep 2024 Prob (F-statistic): 0.000313\n", "Time: 16:45:07 Log-Likelihood: -70.016\n", "No. Observations: 7 AIC: 146.0\n", "Df Residuals: 4 BIC: 145.9\n", "Df Model: 2 \n", "Covariance Type: nonrobust \n", "======================================================================================\n", " coef std err t P>|t| [0.025 0.975]\n", "--------------------------------------------------------------------------------------\n", "const -3.743e+04 3.87e+04 -0.966 0.389 -1.45e+05 7.01e+04\n", "House Size (sq.ft) 174.7967 15.079 11.592 0.000 132.931 216.663\n", "House Age (yrs) -638.7921 1682.445 -0.380 0.723 -5310.009 4032.425\n", "==============================================================================\n", "Omnibus: nan Durbin-Watson: 1.294\n", "Prob(Omnibus): nan Jarque-Bera (JB): 0.364\n", "Skew: -0.316 Prob(JB): 0.833\n", "Kurtosis: 2.078 Cond. No. 3.19e+04\n", "==============================================================================\n", "\n", "Notes:\n", "[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.\n", "[2] The condition number is large, 3.19e+04. This might indicate that there are\n", "strong multicollinearity or other numerical problems.\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "/opt/homebrew/Caskroom/miniconda/base/envs/accelai/lib/python3.12/site-packages/statsmodels/stats/stattools.py:74: ValueWarning: omni_normtest is not valid with less than 8 observations; 7 samples were given.\n", " warn(\"omni_normtest is not valid with less than 8 observations; %i \"\n" ] } ], "source": [ "# Remove features with high VIF, but keep one of the correlated x (VIF > 5 is usually considered high)\n", "# Let's assume 'Number of Bedrooms' had a high VIF\n", "X_reduced_1 = X.drop(columns=['Number of Bedrooms'])\n", "X_reduced_with_const_1 = sm.add_constant(X_reduced_1)\n", "\n", "# Recalculate VIF after removing the feature\n", "vif_data_reduced_1 = pd.DataFrame()\n", "vif_data_reduced_1[\"feature\"] = X_reduced_1.columns\n", "vif_data_reduced_1[\"VIF\"] = [variance_inflation_factor(X, i+1) for i in range(X_reduced_1.shape[1])]\n", "\n", "print(\"\\nVIF Scores after removing high VIF feature:\")\n", "print(vif_data_reduced_1)\n", "\n", "# Build the reduced model\n", "model_reduced_1 = sm.OLS(y, X_reduced_with_const_1).fit()\n", "\n", "# Model Summary for reduced model\n", "print(\"\\nReduced Model Summary:\")\n", "print(model_reduced_1.summary())" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "VIF Scores after removing high VIF feature:\n", " feature VIF\n", "0 Number of Bedrooms 75.623766\n", "1 House Age (yrs) 5.896487\n", "\n", "Reduced Model Summary:\n", " OLS Regression Results \n", "==============================================================================\n", "Dep. Variable: House Price ($) R-squared: 0.740\n", "Model: OLS Adj. R-squared: 0.609\n", "Method: Least Squares F-statistic: 5.680\n", "Date: Thu, 19 Sep 2024 Prob (F-statistic): 0.0678\n", "Time: 16:48:45 Log-Likelihood: -79.426\n", "No. Observations: 7 AIC: 164.9\n", "Df Residuals: 4 BIC: 164.7\n", "Df Model: 2 \n", "Covariance Type: nonrobust \n", "======================================================================================\n", " coef std err t P>|t| [0.025 0.975]\n", "--------------------------------------------------------------------------------------\n", "const 2.216e+05 8.31e+04 2.666 0.056 -9222.876 4.52e+05\n", "Number of Bedrooms 3.958e+04 1.7e+04 2.325 0.081 -7682.826 8.68e+04\n", "House Age (yrs) -5432.5956 5956.414 -0.912 0.413 -2.2e+04 1.11e+04\n", "==============================================================================\n", "Omnibus: nan Durbin-Watson: 2.121\n", "Prob(Omnibus): nan Jarque-Bera (JB): 0.844\n", "Skew: 0.606 Prob(JB): 0.656\n", "Kurtosis: 1.805 Cond. No. 53.6\n", "==============================================================================\n", "\n", "Notes:\n", "[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "/opt/homebrew/Caskroom/miniconda/base/envs/accelai/lib/python3.12/site-packages/statsmodels/stats/stattools.py:74: ValueWarning: omni_normtest is not valid with less than 8 observations; 7 samples were given.\n", " warn(\"omni_normtest is not valid with less than 8 observations; %i \"\n" ] } ], "source": [ "# Remove features with high VIF, but keep one of the correlated x (VIF > 5 is usually considered high)\n", "# Let's assume 'Number of Bedrooms' had a high VIF\n", "X_reduced_2 = X.drop(columns=['House Size (sq.ft)'])\n", "X_reduced_with_const_2 = sm.add_constant(X_reduced_2)\n", "\n", "# Recalculate VIF after removing the feature\n", "vif_data_reduced_2 = pd.DataFrame()\n", "vif_data_reduced_2[\"feature\"] = X_reduced_2.columns\n", "vif_data_reduced_2[\"VIF\"] = [variance_inflation_factor(X, i+1) for i in range(X_reduced_2.shape[1])]\n", "\n", "print(\"\\nVIF Scores after removing high VIF feature:\")\n", "print(vif_data_reduced_2)\n", "\n", "# Build the reduced model\n", "model_reduced_2 = sm.OLS(y, X_reduced_with_const_2).fit()\n", "\n", "# Model Summary for reduced model\n", "print(\"\\nReduced Model Summary:\")\n", "print(model_reduced_2.summary())" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "colab": { "provenance": [] }, "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.5" } }, "nbformat": 4, "nbformat_minor": 4 }