{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Trenujemy klasyfikator dla zbioru danych Titanic"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"from pycaret.classification import setup as cls_setup, compare_models as cls_compare_models, finalize_model as cls_finalize_model, predict_model as cls_predict_model, plot_model as cls_plot_model, save_model as cls_save_model\n",
"from pycaret.regression import setup as reg_setup, compare_models as reg_compare_models, finalize_model as reg_finalize_model, predict_model as reg_predict_model, plot_model as reg_plot_model, save_model as reg_save_model\n",
"from pycaret.datasets import get_data\n",
"import pandas as pd\n",
"from IPython.display import Markdown, display"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Wyświetlenie wszystkich dostępnych zbiorów danych"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Dataset | \n",
" Data Types | \n",
" Default Task | \n",
" Target Variable 1 | \n",
" Target Variable 2 | \n",
" # Instances | \n",
" # Attributes | \n",
" Missing Values | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" anomaly | \n",
" Multivariate | \n",
" Anomaly Detection | \n",
" NaN | \n",
" NaN | \n",
" 1000 | \n",
" 10 | \n",
" N | \n",
"
\n",
" \n",
" 1 | \n",
" france | \n",
" Multivariate | \n",
" Association Rule Mining | \n",
" InvoiceNo | \n",
" Description | \n",
" 8557 | \n",
" 8 | \n",
" N | \n",
"
\n",
" \n",
" 2 | \n",
" germany | \n",
" Multivariate | \n",
" Association Rule Mining | \n",
" InvoiceNo | \n",
" Description | \n",
" 9495 | \n",
" 8 | \n",
" N | \n",
"
\n",
" \n",
" 3 | \n",
" bank | \n",
" Multivariate | \n",
" Classification (Binary) | \n",
" deposit | \n",
" NaN | \n",
" 45211 | \n",
" 17 | \n",
" N | \n",
"
\n",
" \n",
" 4 | \n",
" blood | \n",
" Multivariate | \n",
" Classification (Binary) | \n",
" Class | \n",
" NaN | \n",
" 748 | \n",
" 5 | \n",
" N | \n",
"
\n",
" \n",
" 5 | \n",
" cancer | \n",
" Multivariate | \n",
" Classification (Binary) | \n",
" Class | \n",
" NaN | \n",
" 683 | \n",
" 10 | \n",
" N | \n",
"
\n",
" \n",
" 6 | \n",
" credit | \n",
" Multivariate | \n",
" Classification (Binary) | \n",
" default | \n",
" NaN | \n",
" 24000 | \n",
" 24 | \n",
" N | \n",
"
\n",
" \n",
" 7 | \n",
" diabetes | \n",
" Multivariate | \n",
" Classification (Binary) | \n",
" Class variable | \n",
" NaN | \n",
" 768 | \n",
" 9 | \n",
" N | \n",
"
\n",
" \n",
" 8 | \n",
" electrical_grid | \n",
" Multivariate | \n",
" Classification (Binary) | \n",
" stabf | \n",
" NaN | \n",
" 10000 | \n",
" 14 | \n",
" N | \n",
"
\n",
" \n",
" 9 | \n",
" employee | \n",
" Multivariate | \n",
" Classification (Binary) | \n",
" left | \n",
" NaN | \n",
" 14999 | \n",
" 10 | \n",
" N | \n",
"
\n",
" \n",
" 10 | \n",
" heart | \n",
" Multivariate | \n",
" Classification (Binary) | \n",
" DEATH | \n",
" NaN | \n",
" 200 | \n",
" 16 | \n",
" N | \n",
"
\n",
" \n",
" 11 | \n",
" heart_disease | \n",
" Multivariate | \n",
" Classification (Binary) | \n",
" Disease | \n",
" NaN | \n",
" 270 | \n",
" 14 | \n",
" N | \n",
"
\n",
" \n",
" 12 | \n",
" hepatitis | \n",
" Multivariate | \n",
" Classification (Binary) | \n",
" Class | \n",
" NaN | \n",
" 154 | \n",
" 32 | \n",
" Y | \n",
"
\n",
" \n",
" 13 | \n",
" income | \n",
" Multivariate | \n",
" Classification (Binary) | \n",
" income >50K | \n",
" NaN | \n",
" 32561 | \n",
" 14 | \n",
" Y | \n",
"
\n",
" \n",
" 14 | \n",
" juice | \n",
" Multivariate | \n",
" Classification (Binary) | \n",
" Purchase | \n",
" NaN | \n",
" 1070 | \n",
" 15 | \n",
" N | \n",
"
\n",
" \n",
" 15 | \n",
" nba | \n",
" Multivariate | \n",
" Classification (Binary) | \n",
" TARGET_5Yrs | \n",
" NaN | \n",
" 1340 | \n",
" 21 | \n",
" N | \n",
"
\n",
" \n",
" 16 | \n",
" wine | \n",
" Multivariate | \n",
" Classification (Binary) | \n",
" type | \n",
" NaN | \n",
" 6498 | \n",
" 13 | \n",
" N | \n",
"
\n",
" \n",
" 17 | \n",
" telescope | \n",
" Multivariate | \n",
" Classification (Binary) | \n",
" Class | \n",
" NaN | \n",
" 19020 | \n",
" 11 | \n",
" N | \n",
"
\n",
" \n",
" 18 | \n",
" titanic | \n",
" Multivariate | \n",
" Classification (Binary) | \n",
" Survived | \n",
" NaN | \n",
" 891 | \n",
" 11 | \n",
" Y | \n",
"
\n",
" \n",
" 19 | \n",
" us_presidential_election_results | \n",
" Multivariate | \n",
" Classification (Binary) | \n",
" party_winner | \n",
" NaN | \n",
" 497 | \n",
" 7 | \n",
" N | \n",
"
\n",
" \n",
" 20 | \n",
" glass | \n",
" Multivariate | \n",
" Classification (Multiclass) | \n",
" Type | \n",
" NaN | \n",
" 214 | \n",
" 10 | \n",
" N | \n",
"
\n",
" \n",
" 21 | \n",
" iris | \n",
" Multivariate | \n",
" Classification (Multiclass) | \n",
" species | \n",
" NaN | \n",
" 150 | \n",
" 5 | \n",
" N | \n",
"
\n",
" \n",
" 22 | \n",
" poker | \n",
" Multivariate | \n",
" Classification (Multiclass) | \n",
" CLASS | \n",
" NaN | \n",
" 100000 | \n",
" 11 | \n",
" N | \n",
"
\n",
" \n",
" 23 | \n",
" questions | \n",
" Multivariate | \n",
" Classification (Multiclass) | \n",
" Next_Question | \n",
" NaN | \n",
" 499 | \n",
" 4 | \n",
" N | \n",
"
\n",
" \n",
" 24 | \n",
" satellite | \n",
" Multivariate | \n",
" Classification (Multiclass) | \n",
" Class | \n",
" NaN | \n",
" 6435 | \n",
" 37 | \n",
" N | \n",
"
\n",
" \n",
" 25 | \n",
" CTG | \n",
" Multivariate | \n",
" Classification (Multiclass) | \n",
" NSP | \n",
" NaN | \n",
" 2129 | \n",
" 40 | \n",
" Y | \n",
"
\n",
" \n",
" 26 | \n",
" asia_gdp | \n",
" Multivariate | \n",
" Clustering | \n",
" NaN | \n",
" NaN | \n",
" 40 | \n",
" 11 | \n",
" N | \n",
"
\n",
" \n",
" 27 | \n",
" elections | \n",
" Multivariate | \n",
" Clustering | \n",
" NaN | \n",
" NaN | \n",
" 3195 | \n",
" 54 | \n",
" Y | \n",
"
\n",
" \n",
" 28 | \n",
" facebook | \n",
" Multivariate | \n",
" Clustering | \n",
" NaN | \n",
" NaN | \n",
" 7050 | \n",
" 12 | \n",
" N | \n",
"
\n",
" \n",
" 29 | \n",
" ipl | \n",
" Multivariate | \n",
" Clustering | \n",
" NaN | \n",
" NaN | \n",
" 153 | \n",
" 25 | \n",
" N | \n",
"
\n",
" \n",
" 30 | \n",
" jewellery | \n",
" Multivariate | \n",
" Clustering | \n",
" NaN | \n",
" NaN | \n",
" 505 | \n",
" 4 | \n",
" N | \n",
"
\n",
" \n",
" 31 | \n",
" mice | \n",
" Multivariate | \n",
" Clustering | \n",
" NaN | \n",
" NaN | \n",
" 1080 | \n",
" 82 | \n",
" Y | \n",
"
\n",
" \n",
" 32 | \n",
" migration | \n",
" Multivariate | \n",
" Clustering | \n",
" NaN | \n",
" NaN | \n",
" 233 | \n",
" 12 | \n",
" N | \n",
"
\n",
" \n",
" 33 | \n",
" perfume | \n",
" Multivariate | \n",
" Clustering | \n",
" NaN | \n",
" NaN | \n",
" 20 | \n",
" 29 | \n",
" N | \n",
"
\n",
" \n",
" 34 | \n",
" pokemon | \n",
" Multivariate | \n",
" Clustering | \n",
" NaN | \n",
" NaN | \n",
" 800 | \n",
" 13 | \n",
" Y | \n",
"
\n",
" \n",
" 35 | \n",
" population | \n",
" Multivariate | \n",
" Clustering | \n",
" NaN | \n",
" NaN | \n",
" 255 | \n",
" 56 | \n",
" Y | \n",
"
\n",
" \n",
" 36 | \n",
" public_health | \n",
" Multivariate | \n",
" Clustering | \n",
" NaN | \n",
" NaN | \n",
" 224 | \n",
" 21 | \n",
" N | \n",
"
\n",
" \n",
" 37 | \n",
" seeds | \n",
" Multivariate | \n",
" Clustering | \n",
" NaN | \n",
" NaN | \n",
" 210 | \n",
" 7 | \n",
" N | \n",
"
\n",
" \n",
" 38 | \n",
" wholesale | \n",
" Multivariate | \n",
" Clustering | \n",
" NaN | \n",
" NaN | \n",
" 440 | \n",
" 8 | \n",
" N | \n",
"
\n",
" \n",
" 39 | \n",
" tweets | \n",
" Text | \n",
" NLP | \n",
" tweet | \n",
" NaN | \n",
" 8594 | \n",
" 2 | \n",
" N | \n",
"
\n",
" \n",
" 40 | \n",
" amazon | \n",
" Text | \n",
" NLP / Classification | \n",
" reviewText | \n",
" NaN | \n",
" 20000 | \n",
" 2 | \n",
" N | \n",
"
\n",
" \n",
" 41 | \n",
" kiva | \n",
" Text | \n",
" NLP / Classification | \n",
" en | \n",
" NaN | \n",
" 6818 | \n",
" 7 | \n",
" N | \n",
"
\n",
" \n",
" 42 | \n",
" spx | \n",
" Text | \n",
" NLP / Regression | \n",
" text | \n",
" NaN | \n",
" 874 | \n",
" 4 | \n",
" N | \n",
"
\n",
" \n",
" 43 | \n",
" wikipedia | \n",
" Text | \n",
" NLP / Classification | \n",
" Text | \n",
" NaN | \n",
" 500 | \n",
" 3 | \n",
" N | \n",
"
\n",
" \n",
" 44 | \n",
" automobile | \n",
" Multivariate | \n",
" Regression | \n",
" price | \n",
" NaN | \n",
" 202 | \n",
" 26 | \n",
" Y | \n",
"
\n",
" \n",
" 45 | \n",
" bike | \n",
" Multivariate | \n",
" Regression | \n",
" cnt | \n",
" NaN | \n",
" 17379 | \n",
" 15 | \n",
" N | \n",
"
\n",
" \n",
" 46 | \n",
" boston | \n",
" Multivariate | \n",
" Regression | \n",
" medv | \n",
" NaN | \n",
" 506 | \n",
" 14 | \n",
" N | \n",
"
\n",
" \n",
" 47 | \n",
" concrete | \n",
" Multivariate | \n",
" Regression | \n",
" strength | \n",
" NaN | \n",
" 1030 | \n",
" 9 | \n",
" N | \n",
"
\n",
" \n",
" 48 | \n",
" diamond | \n",
" Multivariate | \n",
" Regression | \n",
" Price | \n",
" NaN | \n",
" 6000 | \n",
" 8 | \n",
" N | \n",
"
\n",
" \n",
" 49 | \n",
" energy | \n",
" Multivariate | \n",
" Regression | \n",
" Heating Load | \n",
" Cooling Load | \n",
" 768 | \n",
" 10 | \n",
" N | \n",
"
\n",
" \n",
" 50 | \n",
" forest | \n",
" Multivariate | \n",
" Regression | \n",
" area | \n",
" NaN | \n",
" 517 | \n",
" 13 | \n",
" N | \n",
"
\n",
" \n",
" 51 | \n",
" gold | \n",
" Multivariate | \n",
" Regression | \n",
" Gold_T+22 | \n",
" NaN | \n",
" 2558 | \n",
" 121 | \n",
" N | \n",
"
\n",
" \n",
" 52 | \n",
" house | \n",
" Multivariate | \n",
" Regression | \n",
" SalePrice | \n",
" NaN | \n",
" 1461 | \n",
" 81 | \n",
" Y | \n",
"
\n",
" \n",
" 53 | \n",
" insurance | \n",
" Multivariate | \n",
" Regression | \n",
" charges | \n",
" NaN | \n",
" 1338 | \n",
" 7 | \n",
" N | \n",
"
\n",
" \n",
" 54 | \n",
" parkinsons | \n",
" Multivariate | \n",
" Regression | \n",
" PPE | \n",
" NaN | \n",
" 5875 | \n",
" 22 | \n",
" N | \n",
"
\n",
" \n",
" 55 | \n",
" traffic | \n",
" Multivariate | \n",
" Regression | \n",
" traffic_volume | \n",
" NaN | \n",
" 48204 | \n",
" 8 | \n",
" N | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Dataset Data Types \\\n",
"0 anomaly Multivariate \n",
"1 france Multivariate \n",
"2 germany Multivariate \n",
"3 bank Multivariate \n",
"4 blood Multivariate \n",
"5 cancer Multivariate \n",
"6 credit Multivariate \n",
"7 diabetes Multivariate \n",
"8 electrical_grid Multivariate \n",
"9 employee Multivariate \n",
"10 heart Multivariate \n",
"11 heart_disease Multivariate \n",
"12 hepatitis Multivariate \n",
"13 income Multivariate \n",
"14 juice Multivariate \n",
"15 nba Multivariate \n",
"16 wine Multivariate \n",
"17 telescope Multivariate \n",
"18 titanic Multivariate \n",
"19 us_presidential_election_results Multivariate \n",
"20 glass Multivariate \n",
"21 iris Multivariate \n",
"22 poker Multivariate \n",
"23 questions Multivariate \n",
"24 satellite Multivariate \n",
"25 CTG Multivariate \n",
"26 asia_gdp Multivariate \n",
"27 elections Multivariate \n",
"28 facebook Multivariate \n",
"29 ipl Multivariate \n",
"30 jewellery Multivariate \n",
"31 mice Multivariate \n",
"32 migration Multivariate \n",
"33 perfume Multivariate \n",
"34 pokemon Multivariate \n",
"35 population Multivariate \n",
"36 public_health Multivariate \n",
"37 seeds Multivariate \n",
"38 wholesale Multivariate \n",
"39 tweets Text \n",
"40 amazon Text \n",
"41 kiva Text \n",
"42 spx Text \n",
"43 wikipedia Text \n",
"44 automobile Multivariate \n",
"45 bike Multivariate \n",
"46 boston Multivariate \n",
"47 concrete Multivariate \n",
"48 diamond Multivariate \n",
"49 energy Multivariate \n",
"50 forest Multivariate \n",
"51 gold Multivariate \n",
"52 house Multivariate \n",
"53 insurance Multivariate \n",
"54 parkinsons Multivariate \n",
"55 traffic Multivariate \n",
"\n",
" Default Task Target Variable 1 Target Variable 2 \\\n",
"0 Anomaly Detection NaN NaN \n",
"1 Association Rule Mining InvoiceNo Description \n",
"2 Association Rule Mining InvoiceNo Description \n",
"3 Classification (Binary) deposit NaN \n",
"4 Classification (Binary) Class NaN \n",
"5 Classification (Binary) Class NaN \n",
"6 Classification (Binary) default NaN \n",
"7 Classification (Binary) Class variable NaN \n",
"8 Classification (Binary) stabf NaN \n",
"9 Classification (Binary) left NaN \n",
"10 Classification (Binary) DEATH NaN \n",
"11 Classification (Binary) Disease NaN \n",
"12 Classification (Binary) Class NaN \n",
"13 Classification (Binary) income >50K NaN \n",
"14 Classification (Binary) Purchase NaN \n",
"15 Classification (Binary) TARGET_5Yrs NaN \n",
"16 Classification (Binary) type NaN \n",
"17 Classification (Binary) Class NaN \n",
"18 Classification (Binary) Survived NaN \n",
"19 Classification (Binary) party_winner NaN \n",
"20 Classification (Multiclass) Type NaN \n",
"21 Classification (Multiclass) species NaN \n",
"22 Classification (Multiclass) CLASS NaN \n",
"23 Classification (Multiclass) Next_Question NaN \n",
"24 Classification (Multiclass) Class NaN \n",
"25 Classification (Multiclass) NSP NaN \n",
"26 Clustering NaN NaN \n",
"27 Clustering NaN NaN \n",
"28 Clustering NaN NaN \n",
"29 Clustering NaN NaN \n",
"30 Clustering NaN NaN \n",
"31 Clustering NaN NaN \n",
"32 Clustering NaN NaN \n",
"33 Clustering NaN NaN \n",
"34 Clustering NaN NaN \n",
"35 Clustering NaN NaN \n",
"36 Clustering NaN NaN \n",
"37 Clustering NaN NaN \n",
"38 Clustering NaN NaN \n",
"39 NLP tweet NaN \n",
"40 NLP / Classification reviewText NaN \n",
"41 NLP / Classification en NaN \n",
"42 NLP / Regression text NaN \n",
"43 NLP / Classification Text NaN \n",
"44 Regression price NaN \n",
"45 Regression cnt NaN \n",
"46 Regression medv NaN \n",
"47 Regression strength NaN \n",
"48 Regression Price NaN \n",
"49 Regression Heating Load Cooling Load \n",
"50 Regression area NaN \n",
"51 Regression Gold_T+22 NaN \n",
"52 Regression SalePrice NaN \n",
"53 Regression charges NaN \n",
"54 Regression PPE NaN \n",
"55 Regression traffic_volume NaN \n",
"\n",
" # Instances # Attributes Missing Values \n",
"0 1000 10 N \n",
"1 8557 8 N \n",
"2 9495 8 N \n",
"3 45211 17 N \n",
"4 748 5 N \n",
"5 683 10 N \n",
"6 24000 24 N \n",
"7 768 9 N \n",
"8 10000 14 N \n",
"9 14999 10 N \n",
"10 200 16 N \n",
"11 270 14 N \n",
"12 154 32 Y \n",
"13 32561 14 Y \n",
"14 1070 15 N \n",
"15 1340 21 N \n",
"16 6498 13 N \n",
"17 19020 11 N \n",
"18 891 11 Y \n",
"19 497 7 N \n",
"20 214 10 N \n",
"21 150 5 N \n",
"22 100000 11 N \n",
"23 499 4 N \n",
"24 6435 37 N \n",
"25 2129 40 Y \n",
"26 40 11 N \n",
"27 3195 54 Y \n",
"28 7050 12 N \n",
"29 153 25 N \n",
"30 505 4 N \n",
"31 1080 82 Y \n",
"32 233 12 N \n",
"33 20 29 N \n",
"34 800 13 Y \n",
"35 255 56 Y \n",
"36 224 21 N \n",
"37 210 7 N \n",
"38 440 8 N \n",
"39 8594 2 N \n",
"40 20000 2 N \n",
"41 6818 7 N \n",
"42 874 4 N \n",
"43 500 3 N \n",
"44 202 26 Y \n",
"45 17379 15 N \n",
"46 506 14 N \n",
"47 1030 9 N \n",
"48 6000 8 N \n",
"49 768 10 N \n",
"50 517 13 N \n",
"51 2558 121 N \n",
"52 1461 81 Y \n",
"53 1338 7 N \n",
"54 5875 22 N \n",
"55 48204 8 N "
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Load PyCaret's catalogue of bundled datasets; verbose=False keeps get_data from showing its own preview.\n",
"dataset_df = get_data(dataset='index', verbose=False)\n",
"# Keep the frame as the cell's final expression so the notebook renders it as a table.\n",
"dataset_df"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" PassengerId | \n",
" Survived | \n",
" Pclass | \n",
" Name | \n",
" Sex | \n",
" Age | \n",
" SibSp | \n",
" Parch | \n",
" Ticket | \n",
" Fare | \n",
" Cabin | \n",
" Embarked | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 3 | \n",
" Braund, Mr. Owen Harris | \n",
" male | \n",
" 22.0 | \n",
" 1 | \n",
" 0 | \n",
" A/5 21171 | \n",
" 7.2500 | \n",
" NaN | \n",
" S | \n",
"
\n",
" \n",
" 1 | \n",
" 2 | \n",
" 1 | \n",
" 1 | \n",
" Cumings, Mrs. John Bradley (Florence Briggs Th... | \n",
" female | \n",
" 38.0 | \n",
" 1 | \n",
" 0 | \n",
" PC 17599 | \n",
" 71.2833 | \n",
" C85 | \n",
" C | \n",
"
\n",
" \n",
" 2 | \n",
" 3 | \n",
" 1 | \n",
" 3 | \n",
" Heikkinen, Miss. Laina | \n",
" female | \n",
" 26.0 | \n",
" 0 | \n",
" 0 | \n",
" STON/O2. 3101282 | \n",
" 7.9250 | \n",
" NaN | \n",
" S | \n",
"
\n",
" \n",
" 3 | \n",
" 4 | \n",
" 1 | \n",
" 1 | \n",
" Futrelle, Mrs. Jacques Heath (Lily May Peel) | \n",
" female | \n",
" 35.0 | \n",
" 1 | \n",
" 0 | \n",
" 113803 | \n",
" 53.1000 | \n",
" C123 | \n",
" S | \n",
"
\n",
" \n",
" 4 | \n",
" 5 | \n",
" 0 | \n",
" 3 | \n",
" Allen, Mr. William Henry | \n",
" male | \n",
" 35.0 | \n",
" 0 | \n",
" 0 | \n",
" 373450 | \n",
" 8.0500 | \n",
" NaN | \n",
" S | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" PassengerId Survived Pclass \\\n",
"0 1 0 3 \n",
"1 2 1 1 \n",
"2 3 1 3 \n",
"3 4 1 1 \n",
"4 5 0 3 \n",
"\n",
" Name Sex Age SibSp \\\n",
"0 Braund, Mr. Owen Harris male 22.0 1 \n",
"1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 \n",
"2 Heikkinen, Miss. Laina female 26.0 0 \n",
"3 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 \n",
"4 Allen, Mr. William Henry male 35.0 0 \n",
"\n",
" Parch Ticket Fare Cabin Embarked \n",
"0 0 A/5 21171 7.2500 NaN S \n",
"1 0 PC 17599 71.2833 C85 C \n",
"2 0 STON/O2. 3101282 7.9250 NaN S \n",
"3 0 113803 53.1000 C123 S \n",
"4 0 373450 8.0500 NaN S "
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Fetch the Titanic passenger data; get_data also displays the first rows (see this cell's output).\n",
"df = get_data(dataset='titanic')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Liczba rekordów"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"891"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Number of rows (records) in the loaded frame.\n",
"df.shape[0]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Brakujące wartości"
]
},
{
"cell_type": "code",
"execution_count": 51,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.22446689113355783"
]
},
"execution_count": 51,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Percentage of missing values in every column, highest first.\n",
"# The previous version checked only 'Embarked', hiding columns such as 'Cabin'\n",
"# that visibly contain NaN in the preview of the data.\n",
"# isnull() marks NaN cells; mean() turns each column's marks into a fraction.\n",
"df.isnull().mean().mul(100).sort_values(ascending=False)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Setup"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/markdown": [
"## Mamy do czynienia z problemem: **Klasyfikacji**"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"target_column = 'Survived'\n",
"cls_setup(data=df, target=target_column, session_id=123, verbose=False)\n",
"# Regression alternative (swap with the line above when the target is continuous):\n",
"#reg_setup(data=df, target=target_column, session_id=123, verbose=False)\n",
"# NOTE(review): every column of df (including PassengerId, Name, Ticket) is passed as a feature;\n",
"# consider setup's ignore_features for identifier-like columns - TODO confirm.\n",
"\n",
"# Report whether the target looks like a REGRESSION or a CLASSIFICATION problem.\n",
"# Heuristic: a numeric target with more than 10 distinct values is treated as continuous.\n",
"# NOTE(review): this only displays a message; it does not choose between cls_setup and reg_setup.\n",
"if target_column:\n",
"    target_values = df[target_column]\n",
"    looks_continuous = pd.api.types.is_numeric_dtype(target_values) and target_values.nunique() > 10\n",
"    if looks_continuous:\n",
"        headline = '## Mamy do czynienia z problemem: **Regresji**'\n",
"    else:\n",
"        headline = '## Mamy do czynienia z problemem: **Klasyfikacji**'\n",
"    display(Markdown(headline))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Compare Models"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"\n",
"\n",
" \n",
" \n",
" | \n",
" Model | \n",
" Accuracy | \n",
" AUC | \n",
" Recall | \n",
" Prec. | \n",
" F1 | \n",
" Kappa | \n",
" MCC | \n",
" TT (Sec) | \n",
"
\n",
" \n",
" \n",
" \n",
" lr | \n",
" Logistic Regression | \n",
" 0.8056 | \n",
" 0.8692 | \n",
" 0.6739 | \n",
" 0.7883 | \n",
" 0.7228 | \n",
" 0.5758 | \n",
" 0.5825 | \n",
" 1.7720 | \n",
"
\n",
" \n",
" ridge | \n",
" Ridge Classifier | \n",
" 0.7528 | \n",
" 0.8647 | \n",
" 0.4522 | \n",
" 0.8257 | \n",
" 0.5793 | \n",
" 0.4273 | \n",
" 0.4679 | \n",
" 0.0700 | \n",
"
\n",
" \n",
" et | \n",
" Extra Trees Classifier | \n",
" 0.7400 | \n",
" 0.0000 | \n",
" 0.4774 | \n",
" 0.7654 | \n",
" 0.5815 | \n",
" 0.4088 | \n",
" 0.4356 | \n",
" 0.1520 | \n",
"
\n",
" \n",
" lda | \n",
" Linear Discriminant Analysis | \n",
" 0.6260 | \n",
" 0.5382 | \n",
" 0.0348 | \n",
" 0.0800 | \n",
" 0.0485 | \n",
" 0.0335 | \n",
" 0.0389 | \n",
" 0.0650 | \n",
"
\n",
" \n",
" dt | \n",
" Decision Tree Classifier | \n",
" 0.6164 | \n",
" 0.0000 | \n",
" 0.0000 | \n",
" 0.0000 | \n",
" 0.0000 | \n",
" 0.0000 | \n",
" 0.0000 | \n",
" 0.0890 | \n",
"
\n",
" \n",
" rf | \n",
" Random Forest Classifier | \n",
" 0.6164 | \n",
" 0.0000 | \n",
" 0.0000 | \n",
" 0.0000 | \n",
" 0.0000 | \n",
" 0.0000 | \n",
" 0.0000 | \n",
" 0.1780 | \n",
"
\n",
" \n",
" qda | \n",
" Quadratic Discriminant Analysis | \n",
" 0.6164 | \n",
" 0.5166 | \n",
" 0.0000 | \n",
" 0.0000 | \n",
" 0.0000 | \n",
" 0.0000 | \n",
" 0.0000 | \n",
" 0.0700 | \n",
"
\n",
" \n",
" ada | \n",
" Ada Boost Classifier | \n",
" 0.6164 | \n",
" 0.5000 | \n",
" 0.0000 | \n",
" 0.0000 | \n",
" 0.0000 | \n",
" 0.0000 | \n",
" 0.0000 | \n",
" 0.0690 | \n",
"
\n",
" \n",
" gbc | \n",
" Gradient Boosting Classifier | \n",
" 0.6164 | \n",
" 0.5000 | \n",
" 0.0000 | \n",
" 0.0000 | \n",
" 0.0000 | \n",
" 0.0000 | \n",
" 0.0000 | \n",
" 0.1240 | \n",
"
\n",
" \n",
" lightgbm | \n",
" Light Gradient Boosting Machine | \n",
" 0.6164 | \n",
" 0.0000 | \n",
" 0.0000 | \n",
" 0.0000 | \n",
" 0.0000 | \n",
" 0.0000 | \n",
" 0.0000 | \n",
" 0.0930 | \n",
"
\n",
" \n",
" dummy | \n",
" Dummy Classifier | \n",
" 0.6164 | \n",
" 0.0000 | \n",
" 0.0000 | \n",
" 0.0000 | \n",
" 0.0000 | \n",
" 0.0000 | \n",
" 0.0000 | \n",
" 0.0750 | \n",
"
\n",
" \n",
"
\n"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "d7df699944cc45f7a7031b2299d31331",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Processing: 0%| | 0/45 [00:00, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Estimator ids left out of the comparison.\n",
"excluded_estimators = ['knn', 'svm', 'gpc', 'nb']\n",
"best_model = cls_compare_models(exclude=excluded_estimators)\n",
"# Regression alternative:\n",
"#best_model = reg_compare_models(exclude=['knn', 'svm', 'gpr', 'et', 'en'])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Evaluate Models"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"image/png": "",
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Plot the 'feature' chart for the model selected by compare_models.\n",
"cls_plot_model(estimator=best_model, plot='feature')\n",
"# Regression alternative:\n",
"#reg_plot_model(estimator=best_model, plot='feature')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Finalize Model and Save"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Transformation Pipeline and Model Successfully Saved\n"
]
},
{
"data": {
"text/plain": [
"(Pipeline(memory=Memory(location=None),\n",
" steps=[('label_encoding',\n",
" TransformerWrapperWithInverse(exclude=None, include=None,\n",
" transformer=LabelEncoder())),\n",
" ('numerical_imputer',\n",
" TransformerWrapper(exclude=None,\n",
" include=['sepal_length', 'sepal_width',\n",
" 'petal_length', 'petal_width'],\n",
" transformer=SimpleImputer(add_indicator=False,\n",
" copy=True,\n",
" fill_value=None,\n",
" keep_empt...\n",
" fill_value=None,\n",
" keep_empty_features=False,\n",
" missing_values=nan,\n",
" strategy='most_frequent'))),\n",
" ('actual_estimator',\n",
" LogisticRegression(C=1.0, class_weight=None, dual=False,\n",
" fit_intercept=True, intercept_scaling=1,\n",
" l1_ratio=None, max_iter=1000,\n",
" multi_class='auto', n_jobs=None,\n",
" penalty='l2', random_state=123,\n",
" solver='lbfgs', tol=0.0001, verbose=0,\n",
" warm_start=False))],\n",
" verbose=False),\n",
" 'iris_classification_pipeline.pkl')"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Finalize the model selected by compare_models.\n",
"final_model = cls_finalize_model(estimator=best_model)\n",
"# Regression alternative:\n",
"#final_model = reg_finalize_model(estimator=best_model)\n",
"\n",
"# The model is trained on the Titanic data (target 'Survived'), so the artifact is named after it.\n",
"# The earlier name 'iris_classification_pipeline' would overwrite or mislabel an Iris pipeline.\n",
"# NOTE(review): update any consumer that loads the old file name.\n",
"cls_save_model(final_model, 'titanic_classification_pipeline')\n",
"# Regression alternative:\n",
"#reg_save_model(final_model, 'titanic_regression_pipeline')"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.9"
}
},
"nbformat": 4,
"nbformat_minor": 4
}