COMP5349_week6_lab_solution
Copyright By PowCoder代写 加微信 powcoder
!pip install pyspark
Collecting pyspark
Downloading pyspark-3.2.1.tar.gz (281.4 MB)
|████████████████████████████████| 281.4 MB 34 kB/s
Collecting py4j==0.10.9.3
Downloading py4j-0.10.9.3-py2.py3-none-any.whl (198 kB)
|████████████████████████████████| 198 kB 55.2 MB/s
Building wheels for collected packages: pyspark
Building wheel for pyspark (setup.py) … done
Created wheel for pyspark: filename=pyspark-3.2.1-py2.py3-none-any.whl size=281853642 sha256=6d45448e139daf4d0cbeb87bcb5fc11c70e2955c9d14015c883d7f33d1820bf4
Stored in directory: /root/.cache/pip/wheels/9f/f5/07/7cd8017084dce4e93e84e92efd1e1d5334db05f2e83bcef74f
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9.3 pyspark-3.2.1
from google.colab import drive
# Mount Google Drive at /content/drive; the input paths used later in this
# notebook live under /content/drive/MyDrive.
drive.mount('/content/drive')
Mounted at /content/drive
from pyspark import SparkConf, SparkContext
import csv
# Build the Spark configuration and reuse an existing context if one is
# already running (getOrCreate), which keeps re-running this cell harmless.
spark_conf = SparkConf().setAppName("Week 6 lab solution")
sc = SparkContext.getOrCreate(spark_conf)

# Locations of the two input files on the mounted drive.
input_path = 'file:///content/drive/MyDrive/comp5349/'
movie_file = input_path + "movies.csv"
text_file = input_path + "1984_processed.txt"
#movieData = sc.textFile(movie_file)
Q1. Find the top 5 bigrams in the preprocessed text of 1984
def bigram(line):
    """Convert one line of input text into word-level bigram count records.

    Args:
        line (str): A line of the input text file.

    Returns:
        list: One ``((first_word, second_word), 1)`` tuple for every pair of
        adjacent words, in order of appearance. Empty when the line holds
        fewer than two words.
    """
    # Split on a single space only; punctuation stays attached to its word.
    words = line.strip().split(" ")
    if len(words) < 2:
        # A blank or one-word line has no adjacent pair to report.
        return []
    # Pair each word with its successor and attach a count of 1 so the
    # records can be summed per bigram downstream.
    return [(pair, 1) for pair in zip(words[:-1], words[1:])]
# Spot-check bigram() on a sample line. As the last expression of a notebook
# cell, the returned list is displayed for visual inspection.
line = "wrote big brother, whether refrained writing it, made"
bigram(line)
[(('wrote', 'big'), 1),
(('big', 'brother,'), 1),
(('brother,', 'whether'), 1),
(('whether', 'refrained'), 1),
(('refrained', 'writing'), 1),
(('writing', 'it,'), 1),
(('it,', 'made'), 1)]
# Load the text, expand every line into ((w1, w2), 1) records, and sum the
# ones per bigram.
text_rdd = sc.textFile(text_file)
bigram_rdd = text_rdd.flatMap(bigram)
bigram_counts = bigram_rdd.reduceByKey(lambda total, count: total + count)

# Order by count, largest first, and show the five most frequent bigrams.
(
    bigram_counts
    .sortBy(lambda pair: pair[1], ascending=False)
    .take(5)
)
[(('big', 'brother'), 67),
(('said', 'winston'), 43),
(('old', 'man'), 38),
(('thought', 'police'), 38),
(('said', "o'brien"), 37)]
Q2. Find the year with the most Sci-Fi movies released.
genre = "Sci-Fi"


def filterGenreYear(record):
    """Emit a ``(year, 1)`` record when a movie line belongs to ``genre``.

    Args:
        record (str): One CSV line with three fields. The second field is the
            title, expected to end with the release year in parentheses, and
            the third field is the genre text (e.g.
            ``"111235,Jodorowsky's Dune (2013) ,Documentary|Sci-Fi"``).

    Returns:
        list: ``[(year, 1)]`` with ``year`` as a four-character string when
        the line matches ``genre`` and carries a year; otherwise ``[]``. A
        list is always returned so the function can be used with ``flatMap``.
    """
    # csv.reader handles quoted titles that contain commas.
    for row in csv.reader([record]):
        if len(row) != 3:
            # Malformed line: skip it instead of indexing missing fields.
            continue
        if genre in row[2]:
            title = row[1].strip()
            # The title ends with "(YYYY)"; take the four characters before
            # the closing parenthesis.
            year = title[-5:-1]
            # Titles without a trailing year would yield arbitrary text here;
            # drop them rather than counting under a bogus key.
            if len(year) == 4 and year.isdigit():
                return [(year, 1)]
    return []
# Spot-check filterGenreYear() on one sample CSV line (a Sci-Fi title whose
# name ends with a space before the comma). As the last expression of a
# notebook cell, the returned list is displayed.
record = "111235,Jodorowsky's Dune (2013) ,Documentary|Sci-Fi"
filterGenreYear(record)
[('2013', 1)]
from operator import add
# Count Sci-Fi movies per release year and keep the single busiest year.
movieData = sc.textFile(movie_file)
(
    movieData.flatMap(filterGenreYear)  # (year, 1) per matching movie
    .reduceByKey(add)  # total movies per year
    .sortBy(lambda r: r[1], ascending=False)  # largest count first
    .take(1)  # action that triggers the job and returns the top record
)
[('2009', 49)]
程序代写 CS代考 加微信: powcoder QQ: 1823890830 Email: powcoder@163.com