{ "cells": [
 { "cell_type": "markdown", "metadata": {}, "source": [ "# BostonHousing.csv column name explanation\n", "\n", "* CRIM - per capita crime rate by town\n", "* ZN - proportion of residential land zoned for lots over 25,000 sq.ft.\n", "* INDUS - proportion of non-retail business acres per town.\n", "* CHAS - Charles River dummy variable (1 if tract bounds river; 0 otherwise)\n", "* NOX - nitric oxides concentration (parts per 10 million)\n", "* RM - average number of rooms per dwelling\n", "* AGE - proportion of owner-occupied units built prior to 1940\n", "* DIS - weighted distances to five Boston employment centres\n", "* RAD - index of accessibility to radial highways\n", "* TAX - full-value property-tax rate per \\$10,000\n", "* PTRATIO - pupil-teacher ratio by town\n", "* B - 1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town\n", "* LSTAT - % lower status of the population\n", "* MEDV - Median value of owner-occupied homes in \\$1000's" ] },
 { "cell_type": "code", "execution_count": 5, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "dEH8W9IwdmQZ", "outputId": "ef688dd6-8d1c-4c7d-deab-2811c8211f11" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " crim zn indus chas nox rm age dis rad tax ptratio \\\n", "0 0.00632 18.0 2.31 0 0.538 6.575 65.2 4.0900 1 296 15.3 \n", "1 0.02731 0.0 7.07 0 0.469 6.421 78.9 4.9671 2 242 17.8 \n", "2 0.02729 0.0 7.07 0 0.469 7.185 61.1 4.9671 2 242 17.8 \n", "3 0.03237 0.0 2.18 0 0.458 6.998 45.8 6.0622 3 222 18.7 \n", "4 0.06905 0.0 2.18 0 0.458 7.147 54.2 6.0622 3 222 18.7 \n", "\n", " b lstat medv \n", "0 396.90 4.98 24.0 \n", "1 396.90 9.14 21.6 \n", "2 392.83 4.03 34.7 \n", "3 394.63 2.94 33.4 \n", "4 396.90 5.33 36.2 \n" ] } ], "source": [
  "# Load the Boston housing data set into `df`.\n",
  "\n",
  "import pandas as pd\n",
  "\n",
  "# Relative path: BostonHousing.csv is read from the notebook's working directory.\n",
  "file_path = 'BostonHousing.csv'\n",
  "\n",
  "try:\n",
  "    df = pd.read_csv(file_path)\n",
  "except FileNotFoundError:\n",
  "    print(f\"File not found at: {file_path}\")\n",
  "    # The following cells read `df`; stop here instead of failing later with a NameError.\n",
  "    raise\n",
  "\n",
  "print(df.head())  # Display the first few rows of the DataFrame\n" ] },
 { "cell_type": "code", "execution_count": 7, "metadata": { "id": "w64J8uyRd9nM" }, "outputs": [], "source": [ "import numpy as np\n", "import pandas as pd\n", "import matplotlib.pyplot as plt\n", "from sklearn.model_selection import train_test_split\n", "import statsmodels.api as sm\n" ] },
 { "cell_type": "code", "execution_count": 13, "metadata": { "id": "56EO76-AeLY1" }, "outputs": [], "source": [
  "# Split the frame: every column except 'medv' is a feature, 'medv' is the target (named PRICE).\n",
  "X = df.drop(columns=['medv'])\n",
  "y = df['medv'].rename('PRICE')\n" ] },
 { "cell_type": "code", "execution_count": 14, "metadata": { "id": "m0rOKtaHgqOE" }, "outputs": [], "source": [
  "# Step 1: Dummify (One-Hot Encode) the 'RAD' column\n",
  "# RAD: index of accessibility to radial highways (categorical)\n",
  "# Rebuild the features from `df` instead of editing `X` in place, so this cell can be\n",
  "# re-run safely: once 'rad' has been replaced by its dummies, X['rad'] no longer exists.\n",
  "X = df.drop(columns=['medv']).astype({'rad': int})  # Ensure it's treated as an integer categorical column\n",
  "X = pd.get_dummies(X, columns=['rad'], drop_first=True)  # Drop first to avoid multicollinearity\n" ] },
 { "cell_type": "code", "execution_count": 15, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "nsH1CqrzhTFv", "outputId": "1a0c18b3-5055-4f3e-c40d-3ee191c531b4" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Non-numeric columns: Index(['rad_2', 'rad_3', 'rad_4', 'rad_5', 'rad_6', 'rad_7', 'rad_8',\n", " 'rad_24'],\n", " dtype='object')\n" ] } ], "source": [
  "# Check for non-numeric columns\n",
  "# bool is not a NumPy numeric dtype, so the True/False dummy columns created above are reported here.\n",
  "non_numeric_cols = X.select_dtypes(exclude=np.number).columns\n",
  "print(f\"Non-numeric columns: {non_numeric_cols}\") # Print out any non-numeric columns\n" ] },
 { "cell_type": "code", "execution_count": 16, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 206 }, "id": "1avLuccfgzjL", "outputId": "2ead3029-2ce9-4c7b-823a-a367268da456" }, "outputs": [ { "data": { "text/html": [ "
| \n", " | crim | \n", "zn | \n", "indus | \n", "chas | \n", "nox | \n", "rm | \n", "age | \n", "dis | \n", "tax | \n", "ptratio | \n", "b | \n", "lstat | \n", "rad_2 | \n", "rad_3 | \n", "rad_4 | \n", "rad_5 | \n", "rad_6 | \n", "rad_7 | \n", "rad_8 | \n", "rad_24 | \n", "
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | \n", "0.00632 | \n", "18.0 | \n", "2.31 | \n", "0 | \n", "0.538 | \n", "6.575 | \n", "65.2 | \n", "4.0900 | \n", "296 | \n", "15.3 | \n", "396.90 | \n", "4.98 | \n", "False | \n", "False | \n", "False | \n", "False | \n", "False | \n", "False | \n", "False | \n", "False | \n", "
| 1 | \n", "0.02731 | \n", "0.0 | \n", "7.07 | \n", "0 | \n", "0.469 | \n", "6.421 | \n", "78.9 | \n", "4.9671 | \n", "242 | \n", "17.8 | \n", "396.90 | \n", "9.14 | \n", "True | \n", "False | \n", "False | \n", "False | \n", "False | \n", "False | \n", "False | \n", "False | \n", "
| 2 | \n", "0.02729 | \n", "0.0 | \n", "7.07 | \n", "0 | \n", "0.469 | \n", "7.185 | \n", "61.1 | \n", "4.9671 | \n", "242 | \n", "17.8 | \n", "392.83 | \n", "4.03 | \n", "True | \n", "False | \n", "False | \n", "False | \n", "False | \n", "False | \n", "False | \n", "False | \n", "
| 3 | \n", "0.03237 | \n", "0.0 | \n", "2.18 | \n", "0 | \n", "0.458 | \n", "6.998 | \n", "45.8 | \n", "6.0622 | \n", "222 | \n", "18.7 | \n", "394.63 | \n", "2.94 | \n", "False | \n", "True | \n", "False | \n", "False | \n", "False | \n", "False | \n", "False | \n", "False | \n", "
| 4 | \n", "0.06905 | \n", "0.0 | \n", "2.18 | \n", "0 | \n", "0.458 | \n", "7.147 | \n", "54.2 | \n", "6.0622 | \n", "222 | \n", "18.7 | \n", "396.90 | \n", "5.33 | \n", "False | \n", "True | \n", "False | \n", "False | \n", "False | \n", "False | \n", "False | \n", "False | \n", "