{ "cells": [ { "cell_type": "code", "execution_count": 15, "id": "86db5074-69b4-4293-b91d-e5602693a4c8", "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np" ] }, { "cell_type": "code", "execution_count": 4, "id": "7df6489a-450c-4733-90b4-914cd9417776", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
crimzninduschasnoxrmagedisradtaxptratioblstatmedv
00.0063218.02.3100.5386.57565.24.0900129615.3396.904.9824.0
10.027310.07.0700.4696.42178.94.9671224217.8396.909.1421.6
20.027290.07.0700.4697.18561.14.9671224217.8392.834.0334.7
30.032370.02.1800.4586.99845.86.0622322218.7394.632.9433.4
\n", "
" ], "text/plain": [ " crim zn indus chas nox rm age dis rad tax ptratio \\\n", "0 0.00632 18.0 2.31 0 0.538 6.575 65.2 4.0900 1 296 15.3 \n", "1 0.02731 0.0 7.07 0 0.469 6.421 78.9 4.9671 2 242 17.8 \n", "2 0.02729 0.0 7.07 0 0.469 7.185 61.1 4.9671 2 242 17.8 \n", "3 0.03237 0.0 2.18 0 0.458 6.998 45.8 6.0622 3 222 18.7 \n", "\n", " b lstat medv \n", "0 396.90 4.98 24.0 \n", "1 396.90 9.14 21.6 \n", "2 392.83 4.03 34.7 \n", "3 394.63 2.94 33.4 " ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Read the dataset\n", "housing = pd.read_csv(\"BostonHousing.csv\")\n", "housing.head(4)" ] }, { "cell_type": "code", "execution_count": 5, "id": "f67edb0e-49b1-44ef-936a-603f5a4a000b", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "RangeIndex: 506 entries, 0 to 505\n", "Data columns (total 14 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 crim 506 non-null float64\n", " 1 zn 506 non-null float64\n", " 2 indus 506 non-null float64\n", " 3 chas 506 non-null int64 \n", " 4 nox 506 non-null float64\n", " 5 rm 506 non-null float64\n", " 6 age 506 non-null float64\n", " 7 dis 506 non-null float64\n", " 8 rad 506 non-null int64 \n", " 9 tax 506 non-null int64 \n", " 10 ptratio 506 non-null float64\n", " 11 b 506 non-null float64\n", " 12 lstat 506 non-null float64\n", " 13 medv 506 non-null float64\n", "dtypes: float64(11), int64(3)\n", "memory usage: 55.5 KB\n" ] } ], "source": [ "housing.info()" ] }, { "cell_type": "code", "execution_count": 6, "id": "c64a9ad0-186b-4a17-9f40-adcafbccd4fb", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
crimzninduschasnoxrmagedisradtaxptratioblstatmedv
count506.000000506.000000506.000000506.000000506.000000506.000000506.000000506.000000506.000000506.000000506.000000506.000000506.000000506.000000
mean3.61352411.36363611.1367790.0691700.5546956.28463468.5749013.7950439.549407408.23715418.455534356.67403212.65306322.532806
std8.60154523.3224536.8603530.2539940.1158780.70261728.1488612.1057108.707259168.5371162.16494691.2948647.1410629.197104
min0.0063200.0000000.4600000.0000000.3850003.5610002.9000001.1296001.000000187.00000012.6000000.3200001.7300005.000000
25%0.0820450.0000005.1900000.0000000.4490005.88550045.0250002.1001754.000000279.00000017.400000375.3775006.95000017.025000
50%0.2565100.0000009.6900000.0000000.5380006.20850077.5000003.2074505.000000330.00000019.050000391.44000011.36000021.200000
75%3.67708312.50000018.1000000.0000000.6240006.62350094.0750005.18842524.000000666.00000020.200000396.22500016.95500025.000000
max88.976200100.00000027.7400001.0000000.8710008.780000100.00000012.12650024.000000711.00000022.000000396.90000037.97000050.000000
\n", "
" ], "text/plain": [ " crim zn indus chas nox rm \\\n", "count 506.000000 506.000000 506.000000 506.000000 506.000000 506.000000 \n", "mean 3.613524 11.363636 11.136779 0.069170 0.554695 6.284634 \n", "std 8.601545 23.322453 6.860353 0.253994 0.115878 0.702617 \n", "min 0.006320 0.000000 0.460000 0.000000 0.385000 3.561000 \n", "25% 0.082045 0.000000 5.190000 0.000000 0.449000 5.885500 \n", "50% 0.256510 0.000000 9.690000 0.000000 0.538000 6.208500 \n", "75% 3.677083 12.500000 18.100000 0.000000 0.624000 6.623500 \n", "max 88.976200 100.000000 27.740000 1.000000 0.871000 8.780000 \n", "\n", " age dis rad tax ptratio b \\\n", "count 506.000000 506.000000 506.000000 506.000000 506.000000 506.000000 \n", "mean 68.574901 3.795043 9.549407 408.237154 18.455534 356.674032 \n", "std 28.148861 2.105710 8.707259 168.537116 2.164946 91.294864 \n", "min 2.900000 1.129600 1.000000 187.000000 12.600000 0.320000 \n", "25% 45.025000 2.100175 4.000000 279.000000 17.400000 375.377500 \n", "50% 77.500000 3.207450 5.000000 330.000000 19.050000 391.440000 \n", "75% 94.075000 5.188425 24.000000 666.000000 20.200000 396.225000 \n", "max 100.000000 12.126500 24.000000 711.000000 22.000000 396.900000 \n", "\n", " lstat medv \n", "count 506.000000 506.000000 \n", "mean 12.653063 22.532806 \n", "std 7.141062 9.197104 \n", "min 1.730000 5.000000 \n", "25% 6.950000 17.025000 \n", "50% 11.360000 21.200000 \n", "75% 16.955000 25.000000 \n", "max 37.970000 50.000000 " ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "housing.describe()" ] }, { "cell_type": "code", "execution_count": 10, "id": "c5e600b6-caed-488c-87b8-100a0978640f", "metadata": {}, "outputs": [], "source": [ "# set the dependent variable and features\n", "X = housing.drop(['medv'], axis=1)\n", "y = pd.Series(housing.medv, name='PRICE')" ] }, { "cell_type": "code", "execution_count": 11, "id": "7874f7c4-8e9d-4639-b388-07ce39d0ea02", "metadata": {}, "outputs": [], "source": [ "# Dummify 
(One-Hot Encode)\n", "\n", "# RAD: index of accessibility to radial highways (categorical)\n", "X['rad'] = X['rad'].astype(int) # Ensure it's treated as an integer categorical column\n", "X = pd.get_dummies(X, columns=['rad'], drop_first=True) # Drop first to avoid multicollinearity" ] }, { "cell_type": "code", "execution_count": 12, "id": "92999cfc-dd5c-4f5a-9850-b6e6196ca5d5", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "RangeIndex: 506 entries, 0 to 505\n", "Data columns (total 20 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 crim 506 non-null float64\n", " 1 zn 506 non-null float64\n", " 2 indus 506 non-null float64\n", " 3 chas 506 non-null int64 \n", " 4 nox 506 non-null float64\n", " 5 rm 506 non-null float64\n", " 6 age 506 non-null float64\n", " 7 dis 506 non-null float64\n", " 8 tax 506 non-null int64 \n", " 9 ptratio 506 non-null float64\n", " 10 b 506 non-null float64\n", " 11 lstat 506 non-null float64\n", " 12 rad_2 506 non-null bool \n", " 13 rad_3 506 non-null bool \n", " 14 rad_4 506 non-null bool \n", " 15 rad_5 506 non-null bool \n", " 16 rad_6 506 non-null bool \n", " 17 rad_7 506 non-null bool \n", " 18 rad_8 506 non-null bool \n", " 19 rad_24 506 non-null bool \n", "dtypes: bool(8), float64(10), int64(2)\n", "memory usage: 51.5 KB\n" ] } ], "source": [ "X.info()" ] }, { "cell_type": "code", "execution_count": 16, "id": "155aee64-6347-4728-bba4-dbf367f1a6be", "metadata": {}, "outputs": [], "source": [ "# Collect the non-numeric columns (the boolean rad_* dummy columns created by get_dummies)\n", "non_numeric_cols = X.select_dtypes(exclude=np.number).columns\n", "# Cast each of those columns to int (0/1) so that every feature in X is numeric\n", "for col in non_numeric_cols:\n", " X[col] = X[col].astype(int)" ] }, { "cell_type": "code", "execution_count": 17, "id": "02890fe7-6663-4c16-972e-09bdd90d5f8e", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
crimzninduschasnoxrmagedistaxptratioblstatrad_2rad_3rad_4rad_5rad_6rad_7rad_8rad_24
count506.000000506.000000506.000000506.000000506.000000506.000000506.000000506.000000506.000000506.000000506.000000506.000000506.000000506.000000506.000000506.000000506.000000506.000000506.000000506.000000
mean3.61352411.36363611.1367790.0691700.5546956.28463468.5749013.795043408.23715418.455534356.67403212.6530630.0474310.0750990.2173910.2272730.0513830.0335970.0474310.260870
std8.60154523.3224536.8603530.2539940.1158780.70261728.1488612.105710168.5371162.16494691.2948647.1410620.2127690.2638120.4128790.4194850.2209970.1803670.2127690.439543
min0.0063200.0000000.4600000.0000000.3850003.5610002.9000001.129600187.00000012.6000000.3200001.7300000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.000000
25%0.0820450.0000005.1900000.0000000.4490005.88550045.0250002.100175279.00000017.400000375.3775006.9500000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.000000
50%0.2565100.0000009.6900000.0000000.5380006.20850077.5000003.207450330.00000019.050000391.44000011.3600000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.000000
75%3.67708312.50000018.1000000.0000000.6240006.62350094.0750005.188425666.00000020.200000396.22500016.9550000.0000000.0000000.0000000.0000000.0000000.0000000.0000001.000000
max88.976200100.00000027.7400001.0000000.8710008.780000100.00000012.126500711.00000022.000000396.90000037.9700001.0000001.0000001.0000001.0000001.0000001.0000001.0000001.000000
\n", "
" ], "text/plain": [ " crim zn indus chas nox rm \\\n", "count 506.000000 506.000000 506.000000 506.000000 506.000000 506.000000 \n", "mean 3.613524 11.363636 11.136779 0.069170 0.554695 6.284634 \n", "std 8.601545 23.322453 6.860353 0.253994 0.115878 0.702617 \n", "min 0.006320 0.000000 0.460000 0.000000 0.385000 3.561000 \n", "25% 0.082045 0.000000 5.190000 0.000000 0.449000 5.885500 \n", "50% 0.256510 0.000000 9.690000 0.000000 0.538000 6.208500 \n", "75% 3.677083 12.500000 18.100000 0.000000 0.624000 6.623500 \n", "max 88.976200 100.000000 27.740000 1.000000 0.871000 8.780000 \n", "\n", " age dis tax ptratio b lstat \\\n", "count 506.000000 506.000000 506.000000 506.000000 506.000000 506.000000 \n", "mean 68.574901 3.795043 408.237154 18.455534 356.674032 12.653063 \n", "std 28.148861 2.105710 168.537116 2.164946 91.294864 7.141062 \n", "min 2.900000 1.129600 187.000000 12.600000 0.320000 1.730000 \n", "25% 45.025000 2.100175 279.000000 17.400000 375.377500 6.950000 \n", "50% 77.500000 3.207450 330.000000 19.050000 391.440000 11.360000 \n", "75% 94.075000 5.188425 666.000000 20.200000 396.225000 16.955000 \n", "max 100.000000 12.126500 711.000000 22.000000 396.900000 37.970000 \n", "\n", " rad_2 rad_3 rad_4 rad_5 rad_6 rad_7 \\\n", "count 506.000000 506.000000 506.000000 506.000000 506.000000 506.000000 \n", "mean 0.047431 0.075099 0.217391 0.227273 0.051383 0.033597 \n", "std 0.212769 0.263812 0.412879 0.419485 0.220997 0.180367 \n", "min 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 \n", "25% 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 \n", "50% 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 \n", "75% 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 \n", "max 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 \n", "\n", " rad_8 rad_24 \n", "count 506.000000 506.000000 \n", "mean 0.047431 0.260870 \n", "std 0.212769 0.439543 \n", "min 0.000000 0.000000 \n", "25% 0.000000 0.000000 \n", "50% 0.000000 0.000000 \n", "75% 0.000000 
1.000000 \n", "max 1.000000 1.000000 " ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "X.describe()" ] }, { "cell_type": "code", "execution_count": null, "id": "77ef5be5-9318-47ef-b5d7-f9621683efd0", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.5" } }, "nbformat": 4, "nbformat_minor": 5 }