{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
""
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Regressione"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"from IPython.display import Image\n",
"import warnings\n",
"warnings.filterwarnings('ignore')\n",
"\n",
"%matplotlib inline"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"import scipy.stats as st\n",
"from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet\n",
"from sklearn.preprocessing import StandardScaler, PolynomialFeatures\n",
"from sklearn.decomposition import PCA\n",
"from sklearn.pipeline import Pipeline\n",
"from sklearn.model_selection import train_test_split, cross_validate, cross_val_score, GridSearchCV, KFold, LeaveOneOut\n",
"from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error\n",
"from sklearn.feature_selection import mutual_info_regression\n",
"from sklearn.linear_model import LassoCV, LassoLarsCV, LassoLarsIC, RidgeCV\n",
"import seaborn as sns\n",
"import copy"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"import matplotlib.pyplot as plt\n",
"import matplotlib.colors as mcolors\n",
"from matplotlib import cm\n",
"\n",
"# Notebook-wide plotting defaults: apply the base style first, then override\n",
"# individual rcParams so that every later figure shares the same look.\n",
"plt.style.use('fivethirtyeight')\n",
"\n",
"plt.rcParams.update({\n",
"    # NOTE(review): the active family is 'sans-serif' but only the 'font.serif'\n",
"    # list is customised; 'font.sans-serif' may have been intended - confirm.\n",
"    'font.family': 'sans-serif',\n",
"    'font.serif': 'Ubuntu',\n",
"    'font.monospace': 'Ubuntu Mono',\n",
"    # Text sizes\n",
"    'font.size': 10,\n",
"    'axes.labelsize': 10,\n",
"    'axes.labelweight': 'bold',\n",
"    'axes.titlesize': 10,\n",
"    'xtick.labelsize': 8,\n",
"    'ytick.labelsize': 8,\n",
"    'legend.fontsize': 10,\n",
"    'figure.titlesize': 12,\n",
"    # Images\n",
"    'image.cmap': 'jet',\n",
"    'image.interpolation': 'none',\n",
"    # Figure and line geometry\n",
"    'figure.figsize': (16, 8),\n",
"    'lines.linewidth': 2,\n",
"    'lines.markersize': 8,\n",
"})\n",
"\n",
"# Named colours reused by the plots of this notebook.\n",
"colors = [\n",
"    'xkcd:pale orange',\n",
"    'xkcd:sea blue',\n",
"    'xkcd:pale red',\n",
"    'xkcd:sage green',\n",
"    'xkcd:terra cotta',\n",
"    'xkcd:dull purple',\n",
"    'xkcd:teal',\n",
"    'xkcd:goldenrod',\n",
"    'xkcd:cadet blue',\n",
"    'xkcd:scarlet',\n",
"]\n",
"\n",
"# Custom colormap: 'Spectral' resampled to 512 entries, of which only the\n",
"# 0.70-0.95 portion is kept, evaluated at 256 evenly spaced points.\n",
"# plt.get_cmap is used instead of matplotlib.cm.get_cmap because the latter was\n",
"# deprecated in matplotlib 3.7 and removed in 3.9; both accept (name, lut).\n",
"cmap_big = plt.get_cmap('Spectral', 512)\n",
"cmap = mcolors.ListedColormap(cmap_big(np.linspace(0.7, 0.95, 256)))\n",
"\n",
"# Default box style for text annotations, filled with the first palette colour.\n",
"bbox_props = dict(boxstyle=\"round,pad=0.3\", fc=colors[0], alpha=.5)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Esame del dataset Housing"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Features:\n",
" \n",
"
\n", "1. CRIM per capita crime rate by town\n", "2. ZN proportion of residential land zoned for lots over 25,000 sq.ft.\n", "3. INDUS proportion of non-retail business acres per town\n", "4. CHAS Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)\n", "5. NOX nitric oxides concentration (parts per 10 million)\n", "6. RM average number of rooms per dwelling\n", "7. AGE proportion of owner-occupied units built prior to 1940\n", "8. DIS weighted distances to five Boston employment centres\n", "9. RAD index of accessibility to radial highways\n", "10. TAX full-value property-tax rate per $10,000\n", "11. PTRATIO pupil-teacher ratio by town\n", "12. B 1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town\n", "13. LSTAT % lower status of the population\n", "14. MEDV Median value of owner-occupied homes in $1000s\n", "" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Lettura del dataset in dataframe pandas" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "import urllib.request\n", "\n", "filepath = \"../dataset/\"\n", "url = \"https://tvml.github.io/ml2324/dataset/\"\n", "\n", "def get_file(filename):\n", " IS_COLAB = ('google.colab' in str(get_ipython()))\n", " if IS_COLAB:\n", " urllib.request.urlretrieve (url+filename, filename)\n", " return filename\n", " else:\n", " return filepath+filename" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(506, 14)" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = pd.read_csv(get_file('housing.data.txt'), header=None, sep='\\s+')\n", "df.columns = ['CRIM','ZN','INDUS','CHAS','NOX','RM','AGE','DIS','RAD','TAX','PTRATIO','B','LSTAT','MEDV']\n", "df.shape" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Visualizzazione delle caratteristiche del dataset" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Matrice delle 
distribuzioni mutue delle feature. Sulla diagonale, distribuzione delle singole feature" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "
\n", " | mi | \n", "
---|---|
LSTAT | \n", "0.666217 | \n", "
RM | \n", "0.529658 | \n", "
INDUS | \n", "0.472124 | \n", "
NOX | \n", "0.457762 | \n", "
PTRATIO | \n", "0.432389 | \n", "
TAX | \n", "0.363236 | \n", "
CRIM | \n", "0.350990 | \n", "
AGE | \n", "0.314053 | \n", "
DIS | \n", "0.296431 | \n", "
RAD | \n", "0.220153 | \n", "
ZN | \n", "0.197626 | \n", "
B | \n", "0.164009 | \n", "
CHAS | \n", "0.028975 | \n", "