03 What factors determine the shipping_cost amount?¶
Question and problem definition¶
Among all the factors we have the access, what deternine the shipping_cost amount?
In [35]:
# data analysis and wrangling
import pandas as pd
import numpy as np
import random as rnd
# visualization
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
# machine learning
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
Acquire deta¶
In [30]:
train_df = pd.read_csv(‘orders.csv’)
test_df = pd.read_csv(‘users.csv’)
comebine = [train_df, test_df]
Analyze by decribeing data¶
In [20]:
print(train_df.columns.values)
[‘id’ ‘user_id’ ‘order_time’ ‘item_total’ ‘shipping_cost’
‘discounts_applied’ ‘payment_reject’]
In [21]:
train_df.head()
Out[21]:
id
user_id
order_time
item_total
shipping_cost
discounts_applied
payment_reject
0
1
9852
2016-01-01 0:03:11
60.69
10
NaN
False
1
2
2784
2016-01-01 0:09:32
123.91
15
NaN
False
2
3
1619
2016-01-01 0:17:26
119.75
15
NaN
False
3
4
9725
2016-01-01 0:44:04
151.92
15
NaN
False
4
5
8892
2016-01-01 0:52:23
153.27
15
NaN
False
In [22]:
train_df.info()
RangeIndex: 51738 entries, 0 to 51737
Data columns (total 7 columns):
id 51738 non-null int64
user_id 51738 non-null int64
order_time 51738 non-null object
item_total 51738 non-null float64
shipping_cost 51738 non-null int64
discounts_applied 4300 non-null float64
payment_reject 51738 non-null bool
dtypes: bool(1), float64(2), int64(3), object(1)
memory usage: 2.4+ MB
In [31]:
test_df.info()
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 5 columns):
id 10000 non-null int64
gender 6690 non-null object
age 10000 non-null int64
country 10000 non-null object
days_on_site_in_2016 10000 non-null int64
dtypes: int64(3), object(2)
memory usage: 390.7+ KB
In [25]:
train_df.describe()
Out[25]:
id
user_id
item_total
shipping_cost
discounts_applied
count
51738.000000
51738.000000
51738.000000
51738.000000
4300.0
mean
25869.500000
5006.494646
126.398776
13.994646
5.0
std
14935.618451
2890.480242
35.045324
3.941930
0.0
min
1.000000
2.000000
43.040000
10.000000
5.0
25%
12935.250000
2543.000000
100.750000
10.000000
5.0
50%
25869.500000
4973.000000
126.410000
15.000000
5.0
75%
38803.750000
7487.000000
152.047500
15.000000
5.0
max
51738.000000
10000.000000
222.960000
25.000000
5.0
In [32]:
test_df.describe()
Out[32]:
id
age
days_on_site_in_2016
count
10000.00000
10000.00000
10000.000000
mean
5000.50000
37.57790
10.369800
std
2886.89568
9.83994
9.916132
min
1.00000
21.00000
1.000000
25%
2500.75000
29.00000
3.000000
50%
5000.50000
38.00000
7.000000
75%
7500.25000
46.00000
14.000000
max
10000.00000
54.00000
87.000000
Analyze by visualizing data¶
In [34]:
#train_df.plot(x=’col_item_total’, y=’col_shipping_cost’, style=’0′)
—————————————————————————
KeyError Traceback (most recent call last)
/anaconda3/lib/python3.6/site-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance)
2524 try:
-> 2525 return self._engine.get_loc(key)
2526 except KeyError:
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
KeyError: ‘col_item_total’
During handling of the above exception, another exception occurred:
KeyError Traceback (most recent call last)
—-> 1 train_df.plot(x=’col_item_total’, y=’col_shipping_cost’, style=’0′)
/anaconda3/lib/python3.6/site-packages/pandas/plotting/_core.py in __call__(self, x, y, kind, ax, subplots, sharex, sharey, layout, figsize, use_index, title, grid, legend, style, logx, logy, loglog, xticks, yticks, xlim, ylim, rot, fontsize, colormap, table, yerr, xerr, secondary_y, sort_columns, **kwds)
2675 fontsize=fontsize, colormap=colormap, table=table,
2676 yerr=yerr, xerr=xerr, secondary_y=secondary_y,
-> 2677 sort_columns=sort_columns, **kwds)
2678 __call__.__doc__ = plot_frame.__doc__
2679
/anaconda3/lib/python3.6/site-packages/pandas/plotting/_core.py in plot_frame(data, x, y, kind, ax, subplots, sharex, sharey, layout, figsize, use_index, title, grid, legend, style, logx, logy, loglog, xticks, yticks, xlim, ylim, rot, fontsize, colormap, table, yerr, xerr, secondary_y, sort_columns, **kwds)
1900 yerr=yerr, xerr=xerr,
1901 secondary_y=secondary_y, sort_columns=sort_columns,
-> 1902 **kwds)
1903
1904
/anaconda3/lib/python3.6/site-packages/pandas/plotting/_core.py in _plot(data, x, y, subplots, ax, kind, **kwds)
1707 if is_integer(x) and not data.columns.holds_integer():
1708 x = data.columns[x]
-> 1709 data = data.set_index(x)
1710
1711 if y is not None:
/anaconda3/lib/python3.6/site-packages/pandas/core/frame.py in set_index(self, keys, drop, append, inplace, verify_integrity)
3144 names.append(None)
3145 else:
-> 3146 level = frame[col]._values
3147 names.append(col)
3148 if drop:
/anaconda3/lib/python3.6/site-packages/pandas/core/frame.py in __getitem__(self, key)
2137 return self._getitem_multilevel(key)
2138 else:
-> 2139 return self._getitem_column(key)
2140
2141 def _getitem_column(self, key):
/anaconda3/lib/python3.6/site-packages/pandas/core/frame.py in _getitem_column(self, key)
2144 # get column
2145 if self.columns.is_unique:
-> 2146 return self._get_item_cache(key)
2147
2148 # duplicate columns & possible reduce dimensionality
/anaconda3/lib/python3.6/site-packages/pandas/core/generic.py in _get_item_cache(self, item)
1840 res = cache.get(item)
1841 if res is None:
-> 1842 values = self._data.get(item)
1843 res = self._box_item_values(item, values)
1844 cache[item] = res
/anaconda3/lib/python3.6/site-packages/pandas/core/internals.py in get(self, item, fastpath)
3841
3842 if not isna(item):
-> 3843 loc = self.items.get_loc(item)
3844 else:
3845 indexer = np.arange(len(self.items))[isna(self.items)]
/anaconda3/lib/python3.6/site-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance)
2525 return self._engine.get_loc(key)
2526 except KeyError:
-> 2527 return self._engine.get_loc(self._maybe_cast_indexer(key))
2528
2529 indexer = self.get_indexer([key], method=method, tolerance=tolerance)
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
KeyError: ‘col_item_total’
In [ ]:
x = np.
y = np.
model = LinearRegression()
model.fit(x,y)