Task1_Jaccard
In [2]:
import numpy as np
from pyspark import SparkContext, SparkConf
from pyspark.mllib.linalg import SparseVector
import sys,os
from operator import add
import time
import copy
import math
In [8]:
sc.stop()  # shut down any SparkContext left over from a prior run so a fresh one can be created
In [12]:
tTotalStart = time.time()  # wall-clock start time used to measure total task runtime
In [9]:
# Create the Spark context and load the ratings CSV as an RDD of split fields.
# NOTE: the original text used typographic ("curly") quotes - ” ″ “ ‘ - which are
# SyntaxErrors in Python; they are replaced with plain ASCII quotes here.
sc = SparkContext(appName="Zhendong_hw3_task1")
# read data
INPUT_CSV = "./Data/video_small_num.csv"
mydata = sc.textFile(INPUT_CSV, None, False)
dataHeader = mydata.first()  # first line is the CSV header; filter it out below
mydata = mydata.filter(lambda x: x != dataHeader).map(lambda x: x.split(','))
# parenthesized print works in both Python 2 and Python 3
print(mydata)
PythonRDD[3] at RDD at PythonRDD.scala:48
In [13]:
def compareValue(old, new):
    """Return the smaller of the two values.

    Intended as a reducer: keeps the running minimum when combining
    an existing value (`old`) with an incoming one (`new`).
    """
    return new if new < old else old
In [14]:
def minHash(val, l):
    """Return the minimum of `val` and all elements of `l`.

    Computes a min-hash signature entry: the smallest hash value seen,
    starting from the sentinel `val`.

    Bug fix: the original body immediately overwrote the `val` parameter
    with the hard-coded constant 1000, silently ignoring the caller's
    argument. The passed-in sentinel is now honored; callers that passed
    1000 (matching the old hard-coded value) see identical results.

    :param val: initial sentinel / current minimum (e.g. a value larger
                than any possible hash).
    :param l:   iterable of comparable values; may be empty, in which
                case `val` is returned unchanged.
    :return:    the smallest value among `val` and the elements of `l`.
    """
    for item in l:
        if item < val:
            val = item
    return val
In [ ]: