{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Some very simple classification using scikit-learn\n", "We are going to predict from transactional data whether customers are loyal. We will do this by training a classifier that will predict whether a given customer will return in 2015." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Load relevant modules" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "from sklearn.ensemble import RandomForestClassifier\n", "from sklearn.model_selection import train_test_split\n", "from sklearn.metrics import confusion_matrix" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Data Preparation\n", "Get the data from http://liacs.leidenuniv.nl/~takesfw/DSPM/data/sales.csv and read it with Pandas. The `names` argument provides the column headers." ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", " | saleId | \n", "saleDateTime | \n", "accountName | \n", "coins | \n", "currency | \n", "priceInCurrency | \n", "priceInEUR | \n", "methodId | \n", "ip | \n", "ipCountry | \n", "
---|---|---|---|---|---|---|---|---|---|---|
0 | \n", "22497 | \n", "2010-01-01 08:00:31 | \n", "214001065a105d24da76ad30f | \n", "14000 | \n", "EUR | \n", "10.00 | \n", "10.000 | \n", "40 | \n", "21.213.175.133 | \n", "CY | \n", "
1 | \n", "22499 | \n", "2010-01-01 11:53:24 | \n", "1f9d7ef7422a94de52ada4870 | \n", "35000 | \n", "EUR | \n", "20.00 | \n", "20.000 | \n", "40 | \n", "151.36.162.144 | \n", "CY | \n", "
2 | \n", "22500 | \n", "2010-01-01 11:56:50 | \n", "d459930438f5610a5d915a767 | \n", "14000 | \n", "EUR | \n", "10.00 | \n", "10.000 | \n", "40 | \n", "151.36.162.144 | \n", "CY | \n", "
3 | \n", "22506 | \n", "2010-01-01 13:31:41 | \n", "d35bf11e9d005f64d8027521b | \n", "28000 | \n", "EUR | \n", "20.00 | \n", "20.000 | \n", "40 | \n", "155.196.146.232 | \n", "GR | \n", "
4 | \n", "22507 | \n", "2010-01-01 13:33:54 | \n", "d35bf11e9d005f64d8027521b | \n", "6000 | \n", "EUR | \n", "5.00 | \n", "5.000 | \n", "40 | \n", "155.196.146.232 | \n", "GR | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
747106 | \n", "2559412 | \n", "2015-05-14 03:40:51 | \n", "b158d12fbe85dc2bc50bad52c | \n", "2100 | \n", "EUR | \n", "2.00 | \n", "2.000 | \n", "2000 | \n", "144.76.183.73 | \n", "FR | \n", "
747107 | \n", "2559413 | \n", "2015-05-14 03:41:30 | \n", "b7f401f488a08b823bd7d9c6c | \n", "4000 | \n", "EUR | \n", "4.00 | \n", "4.000 | \n", "2000 | \n", "154.95.162.120 | \n", "GF | \n", "
747108 | \n", "2559425 | \n", "2015-05-14 19:01:32 | \n", "7ba22509dff6afa562d5e73b4 | \n", "1100 | \n", "PLN | \n", "4.99 | \n", "1.228 | \n", "2000 | \n", "1.50.212.16 | \n", "PL | \n", "
747109 | \n", "2559427 | \n", "2015-05-14 23:11:26 | \n", "0b7ce8a6f37f0a19726469ce8 | \n", "4000 | \n", "EUR | \n", "4.00 | \n", "4.000 | \n", "2000 | \n", "218.131.46.249 | \n", "RE | \n", "
747110 | \n", "2559429 | \n", "2015-05-15 09:25:14 | \n", "cc7267f27c95acc46be8b8a8b | \n", "15000 | \n", "EUR | \n", "10.00 | \n", "10.000 | \n", "40 | \n", "21.190.113.186 | \n", "NL | \n", "
747111 rows × 10 columns
\n", "\n", " | coins | \n", "priceInCurrency | \n", "saleId | \n", "
---|---|---|---|
accountName | \n", "\n", " | \n", " | \n", " |
000113b526fbc8d19f03fccb9 | \n", "6100 | \n", "4.666667 | \n", "3 | \n", "
00027408fd5b8ff231814dbf2 | \n", "31000 | \n", "3.598167 | \n", "120 | \n", "
0002c9dd969f5e5831d492524 | \n", "15000 | \n", "8.000000 | \n", "5 | \n", "
0004dd02811925311678f7c85 | \n", "6100 | \n", "4.338000 | \n", "5 | \n", "
00066a451be9a6bc5fda16fd9 | \n", "2100 | \n", "5.617500 | \n", "8 | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "
fffaa6a39be1ab2f86fd2571f | \n", "31000 | \n", "20.000000 | \n", "1 | \n", "
fffb2efd72b3318b86173d294 | \n", "7200 | \n", "8.000000 | \n", "1 | \n", "
fffcd5cce92731953647fe62f | \n", "17000 | \n", "4.454262 | \n", "122 | \n", "
fffd3e63c839efd5435be9619 | \n", "14000 | \n", "15.000000 | \n", "2 | \n", "
fffdb7a076319ce6d46ec4ed8 | \n", "3800 | \n", "3.797500 | \n", "4 | \n", "
68511 rows × 3 columns
\n", "