from google.colab import drive
# Notebook output: Mounted at /content/drive
from pyspark import SparkConf, SparkContext
import csv
# Spark application configuration; only the application name is set here.
spark_conf = SparkConf().setAppName("Week 6 lab solution")
# The rest of the file reads through `sc`. getOrCreate reuses an already
# running context (e.g. when this cell is re-run) instead of failing.
sc = SparkContext.getOrCreate(conf=spark_conf)

# Input files live under the mounted Drive folder; the file:// scheme points
# Spark at the local filesystem.
input_path = "file:///content/drive/MyDrive/comp5349/"
movie_file = input_path + "movies.csv"
text_file = input_path + "1984_processed.txt"
#movieData = sc.textFile(movie_file)
# Q1. Find the top 5 bigrams in the preprocessed text of 1984.
def bigram(line):
    """Convert one line of input text into word-level bigram count records.

    Args:
        line (str): A line of the input text file. Leading and trailing
            whitespace is stripped, then words are split on single spaces.

    Returns:
        list: One ``((first_word, second_word), 1)`` tuple per pair of
        adjacent words, in order of appearance. Empty when the line holds
        fewer than two words.
    """
    words = line.strip().split(" ")
    if len(words) < 2:
        return []
    # Pair every word with its successor. Each pair carries a count of 1 so
    # the records can later be summed per bigram.
    return [(pair, 1) for pair in zip(words[:-1], words[1:])]
# Test the bigram function on a sample line.
line = "wrote big brother, whether refrained writing it, made"
print(bigram(line))
# Expected result (wrapped for readability):
# [(('wrote', 'big'), 1),
#  (('big', 'brother,'), 1),
#  (('brother,', 'whether'), 1),
#  (('whether', 'refrained'), 1),
#  (('refrained', 'writing'), 1),
#  (('writing', 'it,'), 1),
#  (('it,', 'made'), 1)]
# Count every bigram in the text and report the five most frequent ones.
text_rdd = sc.textFile(text_file)
# Each input line expands into zero or more ((w1, w2), 1) records.
bigram_rdd = text_rdd.flatMap(bigram)
# Sum the per-occurrence 1s to get the total count of each bigram.
bigram_counts = bigram_rdd.reduceByKey(lambda a, b: a + b)
# Bind and print the result: a bare expression is only displayed when it is
# the last statement of a notebook cell.
top_bigrams = bigram_counts.sortBy(lambda r: r[1], ascending=False).take(5)
print(top_bigrams)
# Output recorded from the original run (wrapped for readability):
# [(('big', 'brother'), 67),
#  (('said', 'winston'), 43),
#  (('old', 'man'), 38),
#  (('thought', 'police'), 38),
#  (('said', "o'brien"), 37)]
# Q2. Find the year with the most Sci-Fi movies released.
def filterGenreYear(record, genre="Sci-Fi"):
    """Map one CSV line to a ``(year, 1)`` record if the movie is in ``genre``.

    The result is always a list, so the function can be passed to
    ``flatMap``: non-matching or malformed lines contribute nothing.

    Args:
        record (str): One raw CSV line with three fields: id, title, genres.
            The title is expected to end with the release year in
            parentheses, e.g. ``"Jodorowsky's Dune (2013)"``.
        genre (str): Text to look for in the genres field (substring match).
            Defaults to ``"Sci-Fi"``.

    Returns:
        list: ``[(year, 1)]`` with ``year`` as a four-digit string when the
        line has three fields, its genres field contains ``genre`` and the
        title ends with a parenthesised year; otherwise ``[]``.
    """
    # csv.reader copes with quoted titles that contain commas.
    row = next(csv.reader([record]), None)
    if row is None or len(row) != 3 or genre not in row[2]:
        return []
    title = row[1].strip()
    # Titles end with "(YYYY)": take the four characters before the final ")".
    year = title[-5:-1]
    # Skip titles without a trailing year instead of emitting a junk key.
    if len(year) == 4 and year.isdigit():
        return [(year, 1)]
    return []
# Check the function output on a sample record.
record = "111235,Jodorowsky's Dune (2013) ,Documentary|Sci-Fi"
print(filterGenreYear(record))
# Expected result:
# [('2013', 1)]
from operator import add
# Emit (year, 1) for every Sci-Fi movie, sum the counts per year, and keep
# the single year with the highest count.
movieData = sc.textFile(movie_file)
top_scifi_year = (
    movieData.flatMap(filterGenreYear)
    .reduceByKey(add)
    .sortBy(lambda r: r[1], ascending=False)
    .take(1)  # the recorded output is a one-element list
)
print(top_scifi_year)
# Output recorded from the original run:
# [('2009', 49)]
