In [1]:
# Prompt on stdin and keep the reply as a string.
name = input("enter your name")
# print() joins its arguments with a single space by default.
print("Hello ", name)
enter your nameRoozbeh
Hello Roozbeh
In [2]:
# Single- and double-quoted string literals are equivalent in Python.
print('hello')
hello
In [ ]:
data types
In [3]:
# Bind the integer 1 to `a`; a bare name on the last line of a cell makes the notebook display its value.
a = 1
a
Out[3]:
1
In [ ]:
Arithmetic operations
In [7]:
# `/` is true division: the result is a float even when both operands are ints.
a, b = 5, 2
a / b
Out[7]:
2.5
In [ ]:
comparison operators
In [6]:
# Inequality test; `a` and `b` are assigned in another cell of this notebook.
a!=b
Out[6]:
True
In [ ]:
Vectors (Python lists)
In [8]:
# A list literal: an ordered, mutable sequence.
x = [1, 2, 3, 4]
In [9]:
# Membership test: True when some element of `x` equals 3.
3 in x
Out[9]:
True
In [10]:
# Negated membership test: True when no element of `x` equals 7.
7 not in x
Out[10]:
True
In [ ]:
tuples
In [11]:
# A tuple is an ordered, immutable sequence; its elements may have different types.
my_tuple = ('euroka', 2, 3.0)
my_tuple
Out[11]:
('euroka', 2, 3.0)
In [12]:
# Tuples cannot be modified in place: `+` builds a new, longer tuple that is rebound to the same name.
# Re-running this cell appends the pair again.
my_tuple = my_tuple + ('abc', 3)
In [ ]:
dictionary
In [14]:
# A dictionary maps keys to values; here both keys and values are strings.
my_dict = {"name": "Mike", "Gender": "F"}
my_dict
Out[14]:
{'Gender': 'F', 'name': 'Mike'}
In [ ]:
sets
In [15]:
# A set is an unordered collection of unique, hashable values; mixed types are allowed.
my_set = {1, 2, "a"}
my_set
Out[15]:
{1, 2, 'a'}
In [17]:
# A flat list of ints; a later cell converts it to a NumPy array.
my_list = [1, 2, 3]
my_list
Out[17]:
[1, 2, 3]
In [ ]:
NumPy
In [18]:
# NumPy is imported under its conventional alias; later cells reuse `np`.
import numpy as np
# Convert the Python list `my_list` (defined in another cell) into a NumPy array.
np.array(my_list)
Out[18]:
array([1, 2, 3])
In [19]:
# A list of equal-length rows converts to a 2-D array (3 rows x 3 columns here).
# Note that `my_list` is rebound from the flat list used earlier.
my_list = [
    [1, 2, 3],
    [4, 5, 6],
    [7, 8, 9],
]
np.array(my_list)
Out[19]:
array([[1, 2, 3],
[4, 5, 6],
[7, 8, 9]])
In [25]:
# arange(start, stop) excludes `stop`, so this holds the integers 1 through 9.
a = np.arange(1, 10)
a
Out[25]:
array([1, 2, 3, 4, 5, 6, 7, 8, 9])
In [26]:
# Indexing is zero-based, so position 1 is the second element.
a[1]
Out[26]:
2
In [27]:
# The third argument is the step: every second integer from 1 up to, but not including, 10.
np.arange(1,10,2)
Out[27]:
array([1, 3, 5, 7, 9])
In [28]:
# The shape is passed as one tuple: 4 rows by 4 columns, filled with floating-point zeros.
np.zeros((4,4))
Out[28]:
array([[ 0., 0., 0., 0.],
[ 0., 0., 0., 0.],
[ 0., 0., 0., 0.],
[ 0., 0., 0., 0.]])
In [29]:
# pandas is imported under its conventional alias; later cells reuse `pd`.
import pandas as pd

# A Series is a 1-D array of values paired with an index of labels.
labels = ['a', 'b', 'c']
my_data = [5, 10, 15]
pd.Series(data=my_data, index=labels)
Out[29]:
a 5
b 10
c 15
dtype: int64
In [ ]:
Example 1
In [36]:
# Paired observations: each position in 'content' belongs with the same position in 'yield'.
info = {
    'content': [23.1, 32.8, 31.8, 32.0, 30.4, 24.0, 39.5, 24.2, 52.5,
                37.9, 30.5, 25.1, 12.4, 35.1, 31.5, 21.1, 27.6],
    'yield': [10.5, 16.7, 18.2, 17.0, 16.3, 10.5, 23.1, 12.4, 24.9,
              22.8, 14.1, 12.9, 8.8, 17.4, 14.9, 10.5, 16.1],
}
# Each dict key becomes a column of the DataFrame.
snake = pd.DataFrame(data=info)
snake
In [ ]:
import matplotlib.pyplot as plt

# `x` and `y` are reused by the regression cells that follow.
x = snake["content"]
y = snake["yield"]

# Semi-transparent blue markers keep overlapping points distinguishable.
plt.scatter(x, y, c="b", alpha=0.5)
plt.xlabel("Water content of snow ")
plt.ylabel("Water yield")
plt.title("Water content of snow vs. Water yield")
plt.show()
In [ ]:
# Least-squares fit of a degree-1 polynomial (a straight line) to the (x, y) points.
fit = np.polyfit(x, y, deg=1)
# Wrap the coefficients in a callable so the line can be evaluated at any x.
fit_fn = np.poly1d(fit)
In [ ]:
# Fitted linear regression: observations as blue circles ('bo'),
# fitted line as a solid black line ('-k') evaluated at the observed x values.
plt.plot(x, y, 'bo', x, fit_fn(x), '-k')
plt.xlabel("Water content of snow ")
plt.ylabel("Water yield")
plt.title("Water content of snow vs. Water yield")
plt.show()
In [ ]:
Example 2
In [30]:
# NOTE(review): absolute path on the author's machine; point this at your local copy of the CSV.
TITANIC_TRAIN_CSV = 'C:/My Courses/Latespring2018/ANLY530/Lecture05/titanic_train.csv'

# Load the Titanic training data and preview the first three rows.
train = pd.read_csv(TITANIC_TRAIN_CSV)
train.head(3)
Out[30]:
PassengerId
Survived
Pclass
Name
Sex
Age
SibSp
Parch
Ticket
Fare
Cabin
Embarked
0
1
0
3
Braund, Mr. Owen Harris
male
22.0
1
0
A/5 21171
7.2500
NaN
S
1
2
1
1
Cumings, Mrs. John Bradley (Florence Briggs Th…
female
38.0
1
0
PC 17599
71.2833
C85
C
2
4
1
1
Futrelle, Mrs. Jacques Heath (Lily May Peel)
female
35.0
1
0
113803
53.1000
C123
S
In [31]:
# Indicator feature: 1 when a cabin value is present, 0 when it is missing.
train['Has_Cabin'] = train["Cabin"].notnull().astype(int)
# Per-column flag for remaining missing values; kept as the last expression so the notebook displays it.
train.isnull().any()
In [32]:
# Seed for the random Age imputation so a fresh run reproduces the same values.
AGE_IMPUTE_SEED = 42

# Remove all NULLS in the Embarked column.
# Applied directly to the column: iterating over a DataFrame yields column labels, not frames.
train['Embarked'] = train['Embarked'].fillna('S')

# Remove all NULLS in the Fare column (use the median fare).
train['Fare'] = train['Fare'].fillna(train['Fare'].median())

# Remove all NULLS in the Age column: draw random integer ages from
# [mean - std, mean + std) for exactly the rows where Age is missing.
age_avg = train['Age'].mean()
age_std = train['Age'].std()
age_is_null = train['Age'].isnull()
age_null_count = age_is_null.sum()
# randint expects integer bounds, so the float statistics are truncated explicitly.
age_null_random_list = np.random.RandomState(AGE_IMPUTE_SEED).randint(
    int(age_avg - age_std), int(age_avg + age_std), size=age_null_count
)
# A single .loc assignment avoids pandas' chained-assignment warning.
train.loc[age_is_null, 'Age'] = age_null_random_list
train['Age'] = train['Age'].astype(int)

# Males are the majority, so a missing Sex is filled with 'male'.
train['Sex'] = train['Sex'].fillna('male')

# Mapping Sex to integer codes.
# NOTE: .map() turns unmapped values into NaN, so re-running this cell on
# already-encoded columns wipes them; reload `train` before re-running.
train['Sex'] = train['Sex'].map({'male': 0, 'female': 1})

# Mapping Embarked to integer codes.
train['Embarked'] = train['Embarked'].map({'S': 0, 'C': 1, 'Q': 2})
In [33]:
# Feature selection: remove variables no longer containing relevant information
drop_elements = ['PassengerId', 'Name', 'Ticket', 'Cabin', 'SibSp']
# errors='ignore' skips columns that are already gone, so re-running the cell does not raise KeyError.
train = train.drop(drop_elements, axis=1, errors='ignore')
train.head(3)
Out[33]:
Survived
Pclass
Sex
Age
Parch
Fare
Embarked
Has_Cabin
0
0
3
0
22
0
7.2500
0
0
1
1
1
1
38
0
71.2833
1
1
2
1
1
1
35
0
53.1000
0
1
In [34]:
# Sanity check: count the missing Age values that remain and fail loudly if any are left.
age_null_count = train['Age'].isnull().sum()
assert age_null_count == 0, "Age still contains missing values"
In [35]:
# Split the frame into the target (Survived) and the features fed to the model.
# y_train stays a pandas Series; x_train is a NumPy array of all remaining columns.
y_train = train['Survived']
x_train = train.drop(['Survived'], axis=1).values
In [ ]:
# No other cell brings `tree` into scope, so import scikit-learn's tree module here;
# without it this cell fails with NameError.
from sklearn import tree

# Build a decision-tree classifier with default settings and fit it to the training data.
# fit() returns the fitted estimator itself, so rebinding `model` keeps the same object.
model = tree.DecisionTreeClassifier()
model = model.fit(x_train, y_train)