代写代考 ISO-8859-1") - PowCoder代写

Assignment2_Tutorial-1

#———————————————-
# Import Libraries
#———————————————-

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

from sklearn.feature_extraction.text import CountVectorizer
from sklearn import linear_model

from sklearn.feature_selection import RFE
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import make_scorer, r2_score, mean_squared_error, auc, mean_absolute_error
from sklearn.model_selection import GridSearchCV, KFold
# from sklearn.cross_validation import KFold # old version

from sklearn.neural_network import MLPRegressor
import matplotlib.pyplot as plt

cur_dir = os.getcwd()

Step 1 – Data Cleaning and Data Preparation¶

# Using a small subset of original data
# full data can be found on Kaggle : https://www.kaggle.com/c/job-salary-prediction
Salaries = pd.read_csv(“Salary_Train_sample.csv”, encoding = “ISO-8859-1”)
Salaries_Train,Salaries_Test = train_test_split(Salaries,test_size=0.33, random_state=13)

Salaries.head()

Id Title FullDescription LocationRaw LocationNormalized ContractType ContractTime Company Category SalaryRaw SalaryNormalized SourceName
0 12612628 Engineering Systems Analyst Engineering Systems Analyst Sal… Dorking, Surrey, Surrey Dorking NaN permanent International Engineering Jobs 20000 – 30000/annum 20-30K 25000 cv-library.co.uk
1 12612830 Stress Engineer Glasgow Stress Engineer Glasgow Salary **** to **** We… Glasgow, Scotland, Scotland Glasgow NaN permanent International Engineering Jobs 25000 – 35000/annum 25-35K 30000 cv-library.co.uk
2 12612844 Modelling and simulation analyst Mathematical Modeller / Simulation Analyst / O… Hampshire, South East, South East Hampshire NaN permanent International Engineering Jobs 20000 – 40000/annum 20-40K 30000 cv-library.co.uk
3 12613049 Engineering Systems Analyst / Mathematical Mod… Engineering Systems Analyst / Mathematical Mod… Surrey, South East, South East Surrey NaN permanent International Engineering Jobs 25000 – 30000/annum 25K-30K negotiable 27500 cv-library.co.uk
4 12613647 Pioneer, Systems Analyst Pioneer, Systems Analyst Do… Surrey, South East, South East Surrey NaN permanent International Engineering Jobs 20000 – 30000/annum 20-30K 25000 cv-library.co.uk

Data prep function – explanations for each step provided below

# Function to Clean Data – Explanations Provided in Cells Below

def clean_null(Salaries):
Salaries.dropna(subset=[‘Title’],inplace=True)
Salaries[‘ContractType’].fillna(Salaries[‘ContractType’].mode()[0],inplace=True)
Salaries.loc[Salaries[‘ContractTime’].isnull(), ‘ContractTime’] = ‘Unknown’
Salaries.loc[Salaries[‘Company’].isnull(), ‘Company’] = ‘Unknown’
return Salaries

def featurize(Salaries_Train, Salaries_Test):

vectorizer = CountVectorizer(analyzer = “word”,
tokenizer = None,
preprocessor = None,
stop_words = ‘english’,
max_features = 200,
ngram_range = (1,2))\
.fit(Salaries_Train[‘FullDescription’])

train_words = vectorizer.transform(Salaries_Train[‘FullDescription’])
test_words = vectorizer.transform(Salaries_Test[‘FullDescription’])

title_vectorizer = vectorizer = CountVectorizer(analyzer = “word”, tokenizer = None,
preprocessor = None,
stop_words = ‘english’,
max_features = 200,
ngram_range = (2,3))\
.fit(Salaries_Train[‘Title’])

train_title_words = title_vectorizer.transform(Salaries_Train[‘Title’])
test_title_words = title_vectorizer.transform(Salaries_Test[‘Title’])

location_counts = Salaries_Train.LocationNormalized.value_counts()
value_mask = Salaries_Train.LocationNormalized.isin(location_counts.index[location_counts < 100]) Salaries_Train.loc[value_mask,'LocationNormalized'] = "Other" Salaries_Test.loc[Salaries_Test.LocationNormalized.isin(list(location_counts.index[location_counts > 100])) == False,\
“LocationNormalized”] = “Other”

Salaries_Train = pd.get_dummies(data=Salaries_Train, columns=[‘LocationNormalized’, ‘ContractType’,’Category’,’ContractTime’])
Salaries_Test = pd.get_dummies(data=Salaries_Test, columns=[‘LocationNormalized’, ‘ContractType’,’Category’,’ContractTime’])

# In case there are missing columns in Test
missing_cols = set( Salaries_Train.columns ) – set(Salaries_Test.columns )
for column in missing_cols:
Salaries_Test[column] = 0
Salaries_Test = Salaries_Test[Salaries_Train.columns]

#Combine all features into sparse dataframe

#TRAIN ——————————————————-

features_train = Salaries_Train.drop([‘FullDescription’,
‘Title’,’Id’,’LocationRaw’,’Company’,
‘SalaryRaw’,’SourceName’], axis=1)
title_train = pd.DataFrame(data = train_title_words.toarray(), columns = title_vectorizer.get_feature_names())
description_train = pd.DataFrame(data = train_words.toarray(), columns = vectorizer.get_feature_names())

features_train.reset_index(drop=True, inplace=True)
title_train.reset_index(drop=True, inplace=True)
description_train.reset_index(drop=True, inplace=True)

Salaries_Train = pd.concat([features_train,title_train,description_train], axis = 1)

Salaries_Y = Salaries_Train[‘SalaryNormalized’]
Salaries_X = Salaries_Train.drop([‘SalaryNormalized’], axis=1)

#TEST ——————————————————-

features_test = Salaries_Test.drop([‘FullDescription’,
‘Title’,’Id’,’LocationRaw’,’Company’,
‘SalaryRaw’,’SourceName’], axis=1)
title_test = pd.DataFrame(data = test_title_words.toarray(), columns = title_vectorizer.get_feature_names())
description_test = pd.DataFrame(data = test_words.toarray(), columns = vectorizer.get_feature_names())

features_test.reset_index(drop=True, inplace=True)
title_test.reset_index(drop=True, inplace=True)
description_test.reset_index(drop=True, inplace=True)

Salaries_Test = pd.concat([features_test,title_test,description_test], axis = 1)

Salaries_Y_Test = Salaries_Test[‘SalaryNormalized’]
Salaries_X_Test = Salaries_Test.drop([‘SalaryNormalized’], axis=1)

return Salaries_X, Salaries_Y, Salaries_X_Test, Salaries_Y_Test

Salaries_Train = clean_null(Salaries_Train)
Salaries_Test = clean_null(Salaries_Test)

/home/jupyterlab/conda/lib/python3.6/site-packages/ipykernel_launcher.py:4: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
after removing the cwd from sys.path.
/home/jupyterlab/conda/lib/python3.6/site-packages/pandas/core/generic.py:5430: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
self._update_inplace(new_data)
/home/jupyterlab/conda/lib/python3.6/site-packages/pandas/core/indexing.py:543: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
self.obj[item] = s

Salaries_X, Salaries_Y, Salaries_X_Test, Salaries_Y_Test = featurize(Salaries_Train, Salaries_Test)

/home/jupyterlab/conda/lib/python3.6/site-packages/pandas/core/indexing.py:543: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
self.obj[item] = s

Salaries_X_Test.shape

(5940, 447)

Logic for Data Prep¶

1 a) – Cleaning NaN

Salaries = pd.read_csv(“Salary_Train_sample.csv”, encoding = “ISO-8859-1”)

# See how many null values are in each column
Salaries.isnull().sum(axis=0)

Id 0
Title 1
FullDescription 0
LocationRaw 0
LocationNormalized 0
ContractType 10780
ContractTime 8893
Company 6111
Category 0
SalaryRaw 0
SalaryNormalized 0
SourceName 0
dtype: int64

Salaries[Salaries[‘Title’].isna()]

Id Title FullDescription LocationRaw LocationNormalized ContractType ContractTime Company Category SalaryRaw SalaryNormalized SourceName
1588 48271669 NaN Quality Improvement Manager North West England… Liverpool, Merseyside Liverpool full_time NaN NaN Healthcare & Nursing Jobs 40,000 to 45,000 per year 42500 careworx.co.uk

# There are very few cases where the title is not provided, therefore these will be simply removed
Salaries = Salaries.dropna(subset=[‘Title’])

# could drop rows (or columns) above a certain number of nulls
# thresh = Require that many non-NA values.
Salaries.dropna(axis=0, how=’any’, thresh=9, subset=None, inplace=True)

# There are a lot of null ContractType positions
Salaries[‘ContractType’].unique()

array([nan, ‘full_time’, ‘part_time’], dtype=object)

# Can see that most Contract Types are full-time
Salaries[‘ContractType’].value_counts(normalize=True)

full_time 0.842083
part_time 0.157917
Name: ContractType, dtype: float64

# There are several methods to determine how to handle NaN values
#- including dropping them, filling with mode or mean, or replacing with an “Other” value
# start with data exploration – can check if there a pattern in where the NaN values appear or if the rows with NaN are randomly distributed

# For Example – is the a pattern in job categories where NaN contract type appears or is it similar to the full data set?

a = Salaries[‘Category’].value_counts(normalize=True)
b = Salaries[Salaries[‘ContractType’].isnull()][‘Category’].value_counts(normalize=True)
print (pd.DataFrame({‘Alldata’: a, ‘NaN ContractType’:b}))

Alldata NaN ContractType
Accounting & Finance Jobs 0.064559 0.059091
Admin Jobs 0.025501 0.020501
Charity & Voluntary Jobs 0.007889 0.006772
Consultancy Jobs 0.012334 0.008813
Creative & Design Jobs 0.004000 0.001855
Customer Services Jobs 0.026057 0.024861
Domestic help & Cleaning Jobs 0.001056 0.001113
Energy, Oil & Gas Jobs 0.004111 0.005473
Engineering Jobs 0.095172 0.115863
Graduate Jobs 0.001667 0.000742
HR & Recruitment Jobs 0.042558 0.059740
Healthcare & Nursing Jobs 0.223901 0.188683
Hospitality & Catering Jobs 0.050447 0.064750
IT Jobs 0.149064 0.187662
Legal Jobs 0.009167 0.013451
Logistics & Warehouse Jobs 0.021057 0.013729
Maintenance Jobs 0.010723 0.000278
Manufacturing Jobs 0.021946 0.015492
Other/General Jobs 0.040280 0.028479
PR, Advertising & Marketing Jobs 0.021946 0.011688
Property Jobs 0.005334 0.008534
Retail Jobs 0.010112 0.012709
Sales Jobs 0.055836 0.070594
Scientific & QA Jobs 0.010889 0.003618
Social work Jobs 0.009334 0.012987
Teaching Jobs 0.041058 0.031633
Trade & Construction Jobs 0.023279 0.016790
Travel Jobs 0.010723 0.014100

# fill contracttype with mode
Salaries[‘ContractType’].fillna(Salaries[‘ContractType’].mode()[0],inplace=True)

#Contract time

Salaries[‘ContractTime’].value_counts(normalize=True)

permanent 0.845833
contract 0.154167
Name: ContractTime, dtype: float64

# Comparing job categories for rows with NaN contract time
a = Salaries[‘Category’].value_counts(normalize=True)
b = Salaries[Salaries[‘ContractTime’].isnull()][‘Category’].value_counts(normalize=True)
print (pd.DataFrame({‘Alldata’: a, ‘NaN ContractTime’:b}))

Alldata NaN ContractTime
Accounting & Finance Jobs 0.064559 0.044647
Admin Jobs 0.025501 0.037000
Charity & Voluntary Jobs 0.007889 0.001350
Consultancy Jobs 0.012334 0.015857
Creative & Design Jobs 0.004000 0.002474
Customer Services Jobs 0.026057 0.033288
Domestic help & Cleaning Jobs 0.001056 0.001462
Energy, Oil & Gas Jobs 0.004111 0.003036
Engineering Jobs 0.095172 0.037562
Graduate Jobs 0.001667 0.002474
HR & Recruitment Jobs 0.042558 0.036550
Healthcare & Nursing Jobs 0.223901 0.419703
Hospitality & Catering Jobs 0.050447 0.091430
IT Jobs 0.149064 0.021480
Legal Jobs 0.009167 0.011359
Logistics & Warehouse Jobs 0.021057 0.029802
Maintenance Jobs 0.010723 0.019568
Manufacturing Jobs 0.021946 0.023054
Other/General Jobs 0.040280 0.038574
PR, Advertising & Marketing Jobs 0.021946 0.013495
Property Jobs 0.005334 0.004611
Retail Jobs 0.010112 0.003261
Sales Jobs 0.055836 0.038574
Scientific & QA Jobs 0.010889 0.007872
Social work Jobs 0.009334 0.006973
Teaching Jobs 0.041058 0.032051
Trade & Construction Jobs 0.023279 0.022155
Travel Jobs 0.010723 0.000337

There is a higher proportion of jobs in healthcare, hospitality and catering and ‘other’ where the contract time is NaN. Although this may require some futher exploration, we will now fill them with a filler ‘Unknown’ variable.

Salaries.loc[Salaries[‘ContractTime’].isnull(), ‘ContractTime’] = ‘Unknown’

# Next, missing Company Names are dealt with
Salaries[Salaries[‘Company’].isnull()][‘FullDescription’][0:10]

86 An exciting job opportunity has arisen to work…
93 Job Title: Java Developer Location: Yorkshire …
113 Transaction Services Executive **** **** plus…
114 Audit Senior **** Bristol My client, a reputab…
115 Audit Assistant Manager Bristol **** **** This…
120 Sales Account Executive Basingstoke Salary ***…
130 Excellent new job and career opportunity for a…
137 Barclays are hiring. Future Leaders Developmen…
153 Our client, a global service provider, require…
182 Support Worker Job Chepstow We are a leading p…
Name: FullDescription, dtype: object

Some data exploration shows that missing companies names are caused by a variety of factors – Some posings may have the company name in the full description, while others are third party recruiters, recruiting for a client. NaNs will be filled with a ‘Unknown’ value. This value might capture recruitment by a third party. This could be further refined – for example filling with ‘Third Party’ IF Description contains ‘Client’.

Salaries.loc[Salaries[‘Company’].isnull(), ‘Company’] = ‘Unknown’

1 b) Cleaning Text Data

The dataset contains a significant amount of uncleaned and unstructured text in the ‘Title’ and ‘Full Description’ columns.
A simple approach would be to clean up the text, remove stop words, and do one-hot encoding using countvectorizer.

vectorizer = CountVectorizer(analyzer = “word”,
tokenizer = None,
preprocessor = None,
stop_words = ‘english’,
max_features = 200,
ngram_range = (1,2))

Words = vectorizer.fit_transform(Salaries[‘FullDescription’])

print(vectorizer.get_feature_names()[0:25])

[‘ability’, ‘able’, ‘agency’, ‘applicants’, ‘application’, ‘applications’, ‘apply’, ‘appropriate’, ‘area’, ‘areas’, ‘assist’, ‘available’, ‘background’, ‘based’, ‘basis’, ‘benefits’, ‘best’, ‘business’, ‘candidate’, ‘candidates’, ‘care’, ‘career’, ‘chef’, ‘client’, ‘clients’]

The most common words seem to be relevant to salary information, although more stopwords could possibly be added to list. A better approach may be to set a high # of max_features and run a linear regression for feature selection.

Several of the most frequent words such ‘car,’linux’,’worker’,’digital’ look like they belong to a sequence of words, so the vectorizer could benefit from using n – grams.

title_vectorizer = CountVectorizer(analyzer = “word”, tokenizer = None,
preprocessor = None,
stop_words = ‘english’,
max_features = 200,
ngram_range = (2,3))

Title_Words = title_vectorizer.fit_transform(Salaries[‘Title’])
print(title_vectorizer.get_feature_names()[0:10])

[‘account director’, ‘account executive’, ‘account manager’, ‘accounts assistant’, ‘assistant job’, ‘assistant jobs’, ‘assistant manager’, ‘award winning’, ‘branch manager’, ‘building services’]

The features from n-grams seem more appropriate.

#Salaries.Company.value_counts()

Salaries.LocationNormalized.nunique()

Because there are a lot of unique company names, it may be excessive to get dummies for each one, and there may not be enough examples of each company name to be significant in model training. An option would be to get dummy variables for the top 100 most popular companies or companies above a threshold count.

There is also a significant amount of unique Company locations (2000+). To allow the models to run in reasonable time and eliminate locations that only appear once, we will only consider locations that appear above a certain frequency. Everything else we can replace with “Other”.

location_counts = Salaries.LocationNormalized.value_counts()
value_mask = Salaries.LocationNormalized.isin(location_counts.index[location_cou

程序代写 CS代考加微信: powcoder QQ: 1823890830 Email: powcoder@163.com

Related Posts