import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from subprocess import check_output
# Load the csv
df = pd.read_csv('../input/user-languages.csv')

# Remove the user identifier so that only the feature columns remain
# (presumably all numeric -- confirm against the CSV schema).
try:
    del df['user_id']
except KeyError as error:
    # Best effort: a missing 'user_id' column is reported, not fatal.
    # Print the caught error itself rather than the Exception class.
    print("Error", error)

# Small sample
df.iloc[:5, 3:8]
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.cluster import KMeans
N_DEVELOPERS = 17000
# The sweep below only looks at the last N_DEVELOPERS rows of df.
df_reduced = df.tail(N_DEVELOPERS)

# Average silhouette score for each candidate number of clusters (3..12);
# scores[0] belongs to n_clusters == 3.
scores = []
for n_clusters in range(3, 13):
    kmeans = KMeans(n_clusters=n_clusters, random_state=0).fit(df_reduced)
    labels = kmeans.labels_
    silhouette_avg = silhouette_score(df_reduced, labels)
    print("For developers =", N_DEVELOPERS, " and n_clusters =", n_clusters, "The average silhouette_score is :", silhouette_avg)
    scores.append(silhouette_avg)
scores
For developers = 17000 and n_clusters = 3 The average silhouette_score is : 0.114145734808
For developers = 17000 and n_clusters = 4 The average silhouette_score is : 0.130235327952
For developers = 17000 and n_clusters = 5 The average silhouette_score is : 0.138070874843
For developers = 17000 and n_clusters = 6 The average silhouette_score is : 0.144446465405
For developers = 17000 and n_clusters = 7 The average silhouette_score is : 0.105723284188
For developers = 17000 and n_clusters = 8 The average silhouette_score is : 0.149984291193
For developers = 17000 and n_clusters = 9 The average silhouette_score is : 0.116698114741
For developers = 17000 and n_clusters = 10 The average silhouette_score is : 0.120199226544
For developers = 17000 and n_clusters = 11 The average silhouette_score is : 0.123118952876
For developers = 17000 and n_clusters = 12 The average silhouette_score is : 0.0833308249635
Out[2]:
[0.11414573480812767,
0.13023532795215215,
0.13807087484342054,
0.14444646540548431,
0.10572328418828708,
0.1499842911925803,
0.11669811474070241,
0.12019922654379787,
0.1231189528756723,
0.083330824963460268]
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
# Hard-coded copy of the silhouette scores printed by the sweep above.
# NOTE(review): this list is not read below -- the plot uses `scores`, the
# list filled in by the sweep. Confirm which of the two is intended.
score = [
    0.11414573480812767,
    0.13023532795215215,
    0.13807087484342054,
    0.14444646540548431,
    0.10572328418828708,
    0.1499842911925803,
    0.11669811474070241,
    0.12019922654379787,
    0.1231189528756723,
    0.083330824963460268,
]

# The sweep starts at 3 clusters; three leading placeholders make the
# position on the x axis equal to the number of clusters.
all_scores = [None, None, None] + scores
plt.plot(all_scores)
plt.title('Silhouette scores, for number of clusters')
plt.show()
import operator
import pandas as pd
from sklearn.cluster import KMeans
n_clusters = 8

# Fit on the feature columns only: if this cell is re-run after the
# 'clusters' column has been attached, the old labels must not be fed back
# into KMeans as a feature.
features = df.drop(columns='clusters', errors='ignore')
kmeans = KMeans(n_clusters=n_clusters, random_state=0).fit(features)
labels = kmeans.labels_

# Glue the labels back onto the original data
df['clusters'] = labels
label_df = []  # not used by the visible code; kept for later cells

# One single-row frame per cluster: the ten heaviest skills of the cluster,
# each weight divided by the number of users in that cluster.
role_rows = []
for cluster in range(n_clusters):
    sub_df = df[df['clusters'] == cluster]

    # Total weight per column inside this cluster; columns that sum to zero
    # (or less) are left out.
    dict_tags = {}
    for column in sub_df.columns:
        column_total = sub_df[column].sum()
        if column_total > 0:
            dict_tags[column] = column_total
    dict_tags.pop('clusters', None)  # the label column is not a skill

    # Ascending by weight, so the last ten entries are the heaviest skills.
    sorted_dict_tags = sorted(dict_tags.items(), key=operator.itemgetter(1))
    my_type = pd.DataFrame(sorted_dict_tags, columns=['Skill', 'Weight']).tail(10)

    print("Type: ", cluster, " ", sub_df.shape[0] / df.shape[0] * 100, " % of users")
    print(my_type)

    total = float(sub_df.shape[0])
    new_role_element = pd.DataFrame(
        [my_type['Weight'].astype(float).to_numpy() / total],
        columns=my_type['Skill'].to_numpy(),
    )
    role_rows.append(new_role_element)

# Each new cluster used to be concatenated in front of the previous ones, so
# the last cluster ends up in the first row; reversing keeps that order.
roles = pd.concat(role_rows[::-1])
# A skill missing from a cluster's top ten counts as weight 0 there.
roles = roles.fillna(0)
Type: 0 4.524368592864096 % of users
Skill Weight
814 html 10.309532
815 shell 12.288928
816 library 12.692214
817 css 13.658834
818 framework 13.724775
819 wordpress 15.656482
820 simple 16.767843
821 laravel 17.145426
822 javascript 48.470819
823 php 177.195450
Type: 1 1.0136876467556268 % of users
Skill Weight
247 angular 1.466733
248 google 1.524337
249 build 1.740751
250 ruby 2.301311
251 simple 2.383679
252 website 2.974862
253 theme 3.568978
254 javascript 8.037469
255 html 52.729948
256 css 55.594205
Type: 2 8.951377355248841 % of users
Skill Weight
903 framework 17.497450
904 css 18.681031
905 html 18.835961
906 library 23.068065
907 files 24.096917
908 shell 25.928898
909 simple 30.999944
910 rails 70.087772
911 javascript 85.752403
912 ruby 382.321965
Type: 3 7.049997136475575 % of users
Skill Weight
835 html 13.714895
836 example 13.867262
837 kotlin 14.554947
838 simple 16.659241
839 python 16.802153
840 google 17.366880
841 javascript 30.950143
842 library 31.352717
843 android 158.767554
844 java 313.243258
Type: 4 9.592806826642231 % of users
Skill Weight
930 c 20.403500
931 c++ 21.301816
932 files 21.849977
933 library 24.864481
934 html 25.193654
935 simple 25.211851
936 shell 26.387056
937 django 42.871558
938 javascript 67.216931
939 python 446.095898
Type: 5 25.365099364297578 % of users
Skill Weight
1139 python 45.738051
1140 ruby 46.185570
1141 files 48.629549
1142 angular 49.357477
1143 library 58.929825
1144 simple 70.543248
1145 html 86.535675
1146 css 86.967689
1147 react 96.777150
1148 javascript 1235.883843
Type: 6 37.454899490292654 % of users
Skill Weight
1249 library 106.382199
1250 html 109.401675
1251 java 114.186500
1252 c 117.080352
1253 ruby 120.670388
1254 c++ 127.096563
1255 shell 159.814733
1256 go 171.515901
1257 python 201.921469
1258 javascript 400.154429
Type: 7 6.0477635874234 % of users
Skill Weight
793 phone 10.531609
794 cocoa 11.999170
795 image 12.265055
796 library 16.946037
797 framework 17.086471
798 ruby 17.752189
799 simple 19.643175
800 javascript 24.741107
801 swift 141.650751
802 objective-c 178.567239
from math import pi
import matplotlib.pyplot as plt
import numpy as np
from IPython.display import display, HTML
def show_graph(cat, values, title):
    """Draw a filled radar (polar) chart with one spoke per category.

    Args:
        cat: Sequence of category labels, one per spoke.
        values: Sequence of numbers, one per entry of ``cat``. The radial
            axis is fixed to the range 0-100. The sequence is not modified.
        title: Title displayed above the chart.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    N = len(cat)
    x_as = [n / float(N) * 2 * pi for n in range(N)]

    # The chart is circular, so the outline has to end where it started:
    # repeat the first point at the end. This is done on fresh lists so that
    # the caller's `values` is left untouched.
    closed_values = list(values)
    closed_values += closed_values[:1]
    closed_x_as = x_as + x_as[:1]

    # Set color of axes
    plt.rc('axes', linewidth=0.5, edgecolor="#888888")

    # Create polar plot
    plt.figure(figsize=(15, 7.5))
    ax = plt.subplot(111, polar=True)

    # Set clockwise rotation, starting from the top
    ax.set_theta_offset(pi / 2)
    ax.set_theta_direction(-1)

    # Set position of y-labels
    ax.set_rlabel_position(0)

    # Set color and linestyle of grid
    ax.xaxis.grid(True, color="#888888", linestyle='solid', linewidth=0.5)
    ax.yaxis.grid(True, color="#888888", linestyle='solid', linewidth=0.5)

    # One radial axis per category, without the default angle labels
    # (the category names are drawn by hand below).
    plt.xticks(x_as, [])

    # Plot data (invisible outline) and fill the enclosed area
    ax.plot(closed_x_as, closed_values, linewidth=0, linestyle='solid', zorder=3)
    ax.fill(closed_x_as, closed_values, 'b', alpha=0.3)

    # Set axes limits
    plt.ylim(0, 100)
    plt.title(title)

    # Draw the category names just outside the outer ring, aligned according
    # to the side of the circle they are on so they do not overlap the plot.
    for angle_rad, label in zip(x_as, cat):
        if angle_rad == 0:
            ha, distance_ax = "center", 10
        elif 0 < angle_rad < pi:
            ha, distance_ax = "left", 1
        elif angle_rad == pi:
            ha, distance_ax = "center", 1
        else:
            ha, distance_ax = "right", 1
        ax.text(angle_rad, 100 + distance_ax, label, size=8,
                horizontalalignment=ha, verticalalignment="center")

    # Show polar plot
    plt.show()
# (marker skill, developer description), highest precedence first: when a
# skill set contains several markers, the first matching entry wins.
_DESCRIPTION_RULES = (
    ("wordpress", "PHP Developer"),
    ("swift", "Apple Developer"),
    ("website", "Static HTML Designer"),
    ("rails", "Ruby on Rails"),
    ("react", "React Angular"),
    ("django", "Python with django "),
    ("android", "Android with Java"),
    ("c++", "Multi language Jedi Developer"),
)


def Get_Description(cat):
    """Return the developer description for a given skill set.

    Args:
        cat: Collection of skill names; only membership (``in``) is used.

    Returns:
        The description attached to the highest-precedence marker skill
        found in ``cat``, or ``"Unknown"`` when no marker skill is present.
    """
    for marker, description in _DESCRIPTION_RULES:
        if marker in cat:
            return description
    return "Unknown"
# Description of every row of `roles`, in row order.
role_index = []
for _, row in roles.iterrows():
    # The row total does not change while its columns are scanned.
    row_total = np.sum(row)

    # Skills with a positive weight in this role, and each weight expressed
    # as a percentage of the row total.
    cat = []
    values = []
    for column, weight in row.items():
        if weight > 0:
            cat.append(column)
            values.append(weight / row_total * 100)

    developer_description = Get_Description(cat[:10])

    skills = pd.DataFrame({'Skill': cat[:10], 'Weight': values[:10]})

    print("Developer type : ", developer_description)
    role_index.append(developer_description)
    display(skills.sort_values('Weight', ascending=False))
    show_graph(cat=cat[:10], values=values[:10], title=developer_description)
Developer type : Apple Developer
Developer type : Multi language Jedi Developer
Developer type : React Angular
Developer type : Python with django
Developer type : Android with Java
Developer type : Ruby on Rails
Developer type : Static HTML Designer
Developer type : PHP Developer
from scipy.spatial.distance import squareform, pdist
# Label every role row with its developer description.
roles.index = role_index
display(roles)

# Pairwise Euclidean distance between role rows; pdist returns the condensed
# form, squareform expands it into a square matrix.
res = pdist(roles, 'euclidean')
roles_dist = pd.DataFrame(squareform(res), index=role_index, columns=role_index)
roles_dist
import seaborn as sns
import matplotlib.pyplot as plt
# Annotated heatmap of the role-to-role distance matrix.
f, ax = plt.subplots(figsize=(18, 18))
plt.title("Developer type heatmap")
sns.heatmap(roles_dist, annot=True, linewidths=.5, fmt='.1f', ax=ax, cmap='viridis')
import matplotlib.pyplot as plt
# Draw the graph H without node labels.
# NOTE(review): neither `nx` nor `H` is imported or defined anywhere in the
# visible source, so this cell raises NameError as it stands. `nx` is
# presumably networkx and `H` a graph built in a cell that is missing here --
# TODO confirm and restore the missing import/definition.
plt.figure(figsize=(15,10))
nx.draw(H, with_labels=False, node_size=30)
plt.show()
/docProps/thumbnail.jpeg