{ "cells": [ { "cell_type": "code", "execution_count": 16, "id": "8fa6082d-709e-4a87-88c2-b9037c3d0acc", "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "from pycaret.clustering import setup, create_model, plot_model, assign_model, save_model, load_model, predict_model\n", "#biblioteki do znalezienia optymalnej liczby klastrów\n", "import numpy as np\n", "import matplotlib.pyplot as plt\n", "from sklearn.cluster import KMeans\n", "from sklearn.preprocessing import StandardScaler, LabelEncoder\n", "from sklearn.metrics import silhouette_score" ] }, { "cell_type": "markdown", "id": "d808d5ae-3402-4677-8ee0-01a25fc47e3d", "metadata": {}, "source": [ "### Załadowanie danych" ] }, { "cell_type": "code", "execution_count": 92, "id": "f3fe4f21-5f7b-43a7-95c0-9647e8379d9e", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", " | age | \n", "edu_level | \n", "fav_animals | \n", "fav_place | \n", "gender | \n", "
---|---|---|---|---|---|
0 | \n", "<18 | \n", "Podstawowe | \n", "Brak ulubionych | \n", "NaN | \n", "Kobieta | \n", "
1 | \n", "25-34 | \n", "Średnie | \n", "Psy | \n", "Nad wodą | \n", "Mężczyzna | \n", "
2 | \n", "45-54 | \n", "Wyższe | \n", "Psy | \n", "W lesie | \n", "Mężczyzna | \n", "
3 | \n", "35-44 | \n", "Średnie | \n", "Koty | \n", "W górach | \n", "Mężczyzna | \n", "
4 | \n", "35-44 | \n", "Wyższe | \n", "Psy | \n", "Nad wodą | \n", "Mężczyzna | \n", "
\n", " | age_num | \n", "edu_level_num | \n", "fav_animals_num | \n", "fav_place_num | \n", "gender_num | \n", "
---|---|---|---|---|---|
0 | \n", "5 | \n", "0 | \n", "0 | \n", "4 | \n", "0 | \n", "
1 | \n", "1 | \n", "2 | \n", "4 | \n", "1 | \n", "1 | \n", "
2 | \n", "3 | \n", "1 | \n", "4 | \n", "3 | \n", "1 | \n", "
3 | \n", "2 | \n", "2 | \n", "2 | \n", "2 | \n", "1 | \n", "
4 | \n", "2 | \n", "1 | \n", "4 | \n", "1 | \n", "1 | \n", "
\n", " | Description | \n", "Value | \n", "
---|---|---|
0 | \n", "Session id | \n", "123 | \n", "
1 | \n", "Original data shape | \n", "(229, 5) | \n", "
2 | \n", "Transformed data shape | \n", "(229, 21) | \n", "
3 | \n", "Categorical features | \n", "5 | \n", "
4 | \n", "Rows with missing values | \n", "13.1% | \n", "
5 | \n", "Preprocess | \n", "True | \n", "
6 | \n", "Imputation type | \n", "simple | \n", "
7 | \n", "Numeric imputation | \n", "mean | \n", "
8 | \n", "Categorical imputation | \n", "mode | \n", "
9 | \n", "Maximum one-hot encoding | \n", "-1 | \n", "
10 | \n", "Encoding method | \n", "None | \n", "
11 | \n", "CPU Jobs | \n", "-1 | \n", "
12 | \n", "Use GPU | \n", "False | \n", "
13 | \n", "Log Experiment | \n", "False | \n", "
14 | \n", "Experiment Name | \n", "cluster-default-name | \n", "
15 | \n", "USI | \n", "6427 | \n", "
\n", " | age | \n", "edu_level | \n", "fav_animals | \n", "fav_place | \n", "gender | \n", "
---|---|---|---|---|---|
0 | \n", "<18 | \n", "Podstawowe | \n", "Brak ulubionych | \n", "NaN | \n", "Kobieta | \n", "
1 | \n", "25-34 | \n", "Średnie | \n", "Psy | \n", "Nad wodą | \n", "Mężczyzna | \n", "
2 | \n", "45-54 | \n", "Wyższe | \n", "Psy | \n", "W lesie | \n", "Mężczyzna | \n", "
3 | \n", "35-44 | \n", "Średnie | \n", "Koty | \n", "W górach | \n", "Mężczyzna | \n", "
4 | \n", "35-44 | \n", "Wyższe | \n", "Psy | \n", "Nad wodą | \n", "Mężczyzna | \n", "
\n", " | age_<18 | \n", "age_25-34 | \n", "age_45-54 | \n", "age_35-44 | \n", "age_18-24 | \n", "age_>=65 | \n", "age_55-64 | \n", "age_unknown | \n", "edu_level_Podstawowe | \n", "edu_level_Średnie | \n", "... | \n", "fav_animals_Brak ulubionych | \n", "fav_animals_Psy | \n", "fav_animals_Koty | \n", "fav_animals_Inne | \n", "fav_animals_Koty i Psy | \n", "fav_place_Nad wodą | \n", "fav_place_W lesie | \n", "fav_place_W górach | \n", "fav_place_Inne | \n", "gender | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", "1.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "1.0 | \n", "0.0 | \n", "... | \n", "1.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "1.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "
1 | \n", "0.0 | \n", "1.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "1.0 | \n", "... | \n", "0.0 | \n", "1.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "1.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "1.0 | \n", "
2 | \n", "0.0 | \n", "0.0 | \n", "1.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "... | \n", "0.0 | \n", "1.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "1.0 | \n", "0.0 | \n", "0.0 | \n", "1.0 | \n", "
3 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "1.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "1.0 | \n", "... | \n", "0.0 | \n", "0.0 | \n", "1.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "1.0 | \n", "0.0 | \n", "1.0 | \n", "
4 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "1.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "... | \n", "0.0 | \n", "1.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "1.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "1.0 | \n", "
5 rows × 21 columns
\n", "\n", " | Silhouette | \n", "Calinski-Harabasz | \n", "Davies-Bouldin | \n", "Homogeneity | \n", "Rand Index | \n", "Completeness | \n", "
---|---|---|---|---|---|---|
0 | \n", "0.1953 | \n", "25.9036 | \n", "1.8584 | \n", "0 | \n", "0 | \n", "0 | \n", "
\n", " | age | \n", "edu_level | \n", "fav_animals | \n", "fav_place | \n", "gender | \n", "Cluster | \n", "
---|---|---|---|---|---|---|
0 | \n", "<18 | \n", "Podstawowe | \n", "Brak ulubionych | \n", "NaN | \n", "Kobieta | \n", "Cluster 5 | \n", "
1 | \n", "25-34 | \n", "Średnie | \n", "Psy | \n", "Nad wodą | \n", "Mężczyzna | \n", "Cluster 7 | \n", "
2 | \n", "45-54 | \n", "Wyższe | \n", "Psy | \n", "W lesie | \n", "Mężczyzna | \n", "Cluster 3 | \n", "
3 | \n", "35-44 | \n", "Średnie | \n", "Koty | \n", "W górach | \n", "Mężczyzna | \n", "Cluster 4 | \n", "
4 | \n", "35-44 | \n", "Wyższe | \n", "Psy | \n", "Nad wodą | \n", "Mężczyzna | \n", "Cluster 0 | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
224 | \n", "35-44 | \n", "Wyższe | \n", "Koty | \n", "Inne | \n", "Kobieta | \n", "Cluster 2 | \n", "
225 | \n", "45-54 | \n", "Wyższe | \n", "Inne | \n", "W lesie | \n", "Mężczyzna | \n", "Cluster 3 | \n", "
226 | \n", "25-34 | \n", "Wyższe | \n", "Psy | \n", "W górach | \n", "Mężczyzna | \n", "Cluster 1 | \n", "
227 | \n", "35-44 | \n", "Wyższe | \n", "Brak ulubionych | \n", "W górach | \n", "Mężczyzna | \n", "Cluster 2 | \n", "
228 | \n", "45-54 | \n", "Wyższe | \n", "Koty | \n", "Nad wodą | \n", "Mężczyzna | \n", "Cluster 6 | \n", "
229 rows × 6 columns
\n", "Pipeline(memory=FastMemory(location=/var/folders/03/jfcw3rjd6c9_pp9srs29gbxh0000gn/T/joblib),\n", " steps=[('numerical_imputer',\n", " TransformerWrapper(include=[], transformer=SimpleImputer())),\n", " ('categorical_imputer',\n", " TransformerWrapper(include=['age', 'edu_level', 'fav_animals',\n", " 'fav_place', 'gender'],\n", " transformer=SimpleImputer(strategy='most_frequent'))),\n", " ('ordinal_...\n", " mapping=[{'col': 'gender',\n", " 'data_type': dtype('O'),\n", " 'mapping': Kobieta 0\n", "Mężczyzna 1\n", "NaN -1\n", "dtype: int64}]))),\n", " ('onehot_encoding',\n", " TransformerWrapper(include=['age', 'edu_level', 'fav_animals',\n", " 'fav_place'],\n", " transformer=OneHotEncoder(cols=['age',\n", " 'edu_level',\n", " 'fav_animals',\n", " 'fav_place'],\n", " handle_missing='return_nan',\n", " use_cat_names=True))),\n", " ('trained_model', KMeans(random_state=123))])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
Pipeline(memory=FastMemory(location=/var/folders/03/jfcw3rjd6c9_pp9srs29gbxh0000gn/T/joblib),\n", " steps=[('numerical_imputer',\n", " TransformerWrapper(include=[], transformer=SimpleImputer())),\n", " ('categorical_imputer',\n", " TransformerWrapper(include=['age', 'edu_level', 'fav_animals',\n", " 'fav_place', 'gender'],\n", " transformer=SimpleImputer(strategy='most_frequent'))),\n", " ('ordinal_...\n", " mapping=[{'col': 'gender',\n", " 'data_type': dtype('O'),\n", " 'mapping': Kobieta 0\n", "Mężczyzna 1\n", "NaN -1\n", "dtype: int64}]))),\n", " ('onehot_encoding',\n", " TransformerWrapper(include=['age', 'edu_level', 'fav_animals',\n", " 'fav_place'],\n", " transformer=OneHotEncoder(cols=['age',\n", " 'edu_level',\n", " 'fav_animals',\n", " 'fav_place'],\n", " handle_missing='return_nan',\n", " use_cat_names=True))),\n", " ('trained_model', KMeans(random_state=123))])
TransformerWrapper(include=[], transformer=SimpleImputer())
SimpleImputer()
SimpleImputer()
TransformerWrapper(include=['age', 'edu_level', 'fav_animals', 'fav_place',\n", " 'gender'],\n", " transformer=SimpleImputer(strategy='most_frequent'))
SimpleImputer(strategy='most_frequent')
SimpleImputer(strategy='most_frequent')
TransformerWrapper(include=['gender'],\n", " transformer=OrdinalEncoder(cols=['gender'],\n", " handle_missing='return_nan',\n", " mapping=[{'col': 'gender',\n", " 'data_type': dtype('O'),\n", " 'mapping': Kobieta 0\n", "Mężczyzna 1\n", "NaN -1\n", "dtype: int64}]))
OrdinalEncoder(cols=['gender'], handle_missing='return_nan',\n", " mapping=[{'col': 'gender', 'data_type': dtype('O'),\n", " 'mapping': Kobieta 0\n", "Mężczyzna 1\n", "NaN -1\n", "dtype: int64}])
OrdinalEncoder(cols=['gender'], handle_missing='return_nan',\n", " mapping=[{'col': 'gender', 'data_type': dtype('O'),\n", " 'mapping': Kobieta 0\n", "Mężczyzna 1\n", "NaN -1\n", "dtype: int64}])
TransformerWrapper(include=['age', 'edu_level', 'fav_animals', 'fav_place'],\n", " transformer=OneHotEncoder(cols=['age', 'edu_level',\n", " 'fav_animals', 'fav_place'],\n", " handle_missing='return_nan',\n", " use_cat_names=True))
OneHotEncoder(cols=['age', 'edu_level', 'fav_animals', 'fav_place'],\n", " handle_missing='return_nan', use_cat_names=True)
OneHotEncoder(cols=['age', 'edu_level', 'fav_animals', 'fav_place'],\n", " handle_missing='return_nan', use_cat_names=True)
KMeans(random_state=123)
\n", " | age | \n", "edu_level | \n", "fav_animals | \n", "fav_place | \n", "gender | \n", "
---|---|---|---|---|---|
0 | \n", "25-34 | \n", "Wyższe | \n", "Psy | \n", "W górach | \n", "Mężczyzna | \n", "