程序代写代做代考 cache In [2]:

In [2]:
from pyspark.sql.functions import *

# Load the adjective-noun pair corpus as a DataFrame with one text line per row
# (the rendered output shows a single column named `value`).
# Alternative cluster (Azure Blob Storage) location, kept for reference:
# lines = spark.read.text('wasb://cluster@msbd.blob.core.windows.net/data/adj_noun_pairs.txt')
# NOTE(review): absolute local path -- adjust to your environment before running.
lines = spark.read.text('/Users/mathilde/Desktop/Self-Practice/adj_noun_pairs.txt')
lines.show()

+——————–+
| value|
+——————–+
| early radical|
| french revolution|
| pejorative way|
| violent means|
| positive label|
|self-defined anar…|
|political philosophy|
|differ interpreta…|
| relate movement|
| social movement|
|authoritarian ins…|
| most anarchist|
| harmonious society|
|anti-authoritaria…|
|authoritarian str…|
| political structure|
|coercive institution|
|economic institution|
| social relation|
|voluntary associa…|
+——————–+
only showing top 20 rows

In [3]:
# Converting lines into word pairs.
# Data is dirty: some lines do not contain exactly 2 space-separated words,
# so keep only the rows whose split result has exactly two elements.
# RDD equivalent, kept for reference:
# pairs = lines.map(lambda l: tuple(l.split())).filter(lambda p: len(p)==2)
words = lines.select(split(lines[0], ' ').alias('w')).filter(size('w') == 2)
words.show()
# Unpack the two-element array column `w` into named `adj` / `noun` columns.
pairs = words.select(words['w'][0].alias('adj'), words['w'][1].alias('noun'))
# `pairs` is reused by the count and the groupBy aggregations in later cells.
pairs.cache()
pairs.show()

+——————–+
| w|
+——————–+
| [early, radical]|
|[french, revolution]|
| [pejorative, way]|
| [violent, means]|
| [positive, label]|
|[self-defined, an…|
|[political, philo…|
|[differ, interpre…|
| [relate, movement]|
| [social, movement]|
|[authoritarian, i…|
| [most, anarchist]|
|[harmonious, soci…|
|[anti-authoritari…|
|[authoritarian, s…|
|[political, struc…|
|[coercive, instit…|
|[economic, instit…|
| [social, relation]|
|[voluntary, assoc…|
+——————–+
only showing top 20 rows

+——————+————–+
| adj| noun|
+——————+————–+
| early| radical|
| french| revolution|
| pejorative| way|
| violent| means|
| positive| label|
| self-defined| anarchist|
| political| philosophy|
| differ|interpretation|
| relate| movement|
| social| movement|
| authoritarian| institution|
| most| anarchist|
| harmonious| society|
|anti-authoritarian| society|
| authoritarian| structure|
| political| structure|
| coercive| institution|
| economic| institution|
| social| relation|
| voluntary| association|
+——————+————–+
only showing top 20 rows

In [4]:
# Show the first rows of the (adj, noun) DataFrame again; same call as at the
# end of the previous cell.
pairs.show()

+——————+————–+
| adj| noun|
+——————+————–+
| early| radical|
| french| revolution|
| pejorative| way|
| violent| means|
| positive| label|
| self-defined| anarchist|
| political| philosophy|
| differ|interpretation|
| relate| movement|
| social| movement|
| authoritarian| institution|
| most| anarchist|
| harmonious| society|
|anti-authoritarian| society|
| authoritarian| structure|
| political| structure|
| coercive| institution|
| economic| institution|
| social| relation|
| voluntary| association|
+——————+————–+
only showing top 20 rows

In [4]:
# Total number of (adj, noun) rows; used as N in the PMI expression further down.
N = pairs.count()
In [5]:
# Display the total number of pairs.
N
Out[5]:
3162692
In [6]:
# Compute the frequency of each (adj, noun) pair.
# Ignore pairs that are not frequent enough (fewer than 100 occurrences).
# RDD equivalent, kept for reference:
# pair_freqs = pairs.map(lambda p: (p,1)).reduceByKey(lambda f1, f2: f1 + f2) \
#                   .filter(lambda pf: pf[1] >= 100)

pair_freqs = pairs.groupBy('adj', 'noun').count().filter('count >= 100')
In [7]:
# Preview the frequent pairs together with their `count` column.
pair_freqs.show()

+—————–+————+—–+
| adj| noun|count|
+—————–+————+—–+
| short| story| 1265|
| second| War| 970|
| british| museum| 171|
| many| scientist| 158|
| comic| book| 559|
| southern| state| 127|
| official| language| 682|
| american| culture| 187|
| general| population| 182|
| geographic| coordinate| 133|
| roman| Catholics| 104|
| second| language| 162|
| german| empire| 178|
| fourteenth| amendment| 105|
|polish-lithuanian|Commonwealth| 163|
| 3rd| edition| 100|
| recent| change| 109|
| other| time| 233|
| new| party| 185|
| operate| system| 562|
+—————–+————+—–+
only showing top 20 rows

In [9]:
# Computing the frequencies of the adjectives and the nouns over all pairs.
# The `count` columns are renamed so they do not clash with the `count`
# column of `pair_freqs` when the three DataFrames are joined.
# RDD equivalent, kept for reference:
# a_freqs = pairs.map(lambda p: (p[0],1)).reduceByKey(lambda x,y: x+y)
# n_freqs = pairs.map(lambda p: (p[1],1)).reduceByKey(lambda x,y: x+y)

a_freqs = pairs.groupBy('adj').count().withColumnRenamed('count', 'adjcount')
n_freqs = pairs.groupBy('noun').count().withColumnRenamed('count', 'nouncount')
In [10]:
# Preview the per-adjective frequencies (`adjcount`).
a_freqs.show()

+————-+——–+
| adj|adjcount|
+————-+——–+
| indigenous| 1753|
| few| 11663|
| everyday| 712|
| online| 1753|
| cautious| 70|
| inverted| 155|
| unequivocal| 34|
| incoming| 341|
| 11-year-old| 12|
| lamian| 1|
| inner| 1491|
|precautionary| 63|
| electrical| 2261|
| recognize| 359|
| cattle-based| 1|
| balding| 10|
| inertial| 284|
| lyrical| 177|
| convergent| 106|
| elongate| 127|
+————-+——–+
only showing top 20 rows

In [11]:
# Pointwise mutual information of each frequent pair:
#   PMI(adj, noun) = log2( count(adj, noun) * N / (count(adj) * count(noun)) )
# Join the pair counts with the per-adjective and per-noun counts, compute the
# PMI column, and show the pairs with the highest PMI first.
pair_freqs.join(a_freqs, 'adj').join(n_freqs, 'noun') \
    .select('adj', 'noun',
            log2(col('count') * N / (col('adjcount') * col('nouncount')))
            .alias('PMI')) \
    .orderBy(desc('PMI')).show()

+—————–+————+——————+
| adj| noun| PMI|
+—————–+————+——————+
| magna| carta|14.410196596376286|
|polish-lithuanian|Commonwealth| 13.07137409960666|
| nitrous| oxide| 12.99060582764508|
| latter-day| Saints|12.649734254024207|
| stainless| steel|12.506597586010825|
| pave| runway|12.482339231599479|
| corporal| punishment|12.191415428592215|
| capital| punishment|12.183256905205052|
| rush| yard| 12.1470236944742|
| globular| cluster|12.109954005340597|
| teutonic| knight|12.074200587806475|
| refractive| index|11.828363002104304|
| spinal| cord|11.815718560868772|
| alcoholic| beverage|11.808523043970219|
| unpaved| runway| 11.79695092191404|
| anglican| Communion|11.752242121990406|
| coaxial| cable|11.684885137122127|
| angular| momentum|11.622460251835882|
| unpaved| km|11.408626883879302|
| mobile| cellular|11.315829787299972|
+—————–+————+——————+
only showing top 20 rows

In [ ]:

In [ ]: