Question 4
mapper.py
#!/usr/bin/env python
import sys
def odd_keys(line):
    """Return the odd integers found in *line* as canonical key strings.

    The line is split on whitespace and every token that parses as an
    integer and is odd contributes one key.  Tokens that are not integers
    (words, floats, punctuation) are ignored.

    The key is the normalised integer (``str(int(token))``), not the raw
    token, so that spellings such as ``03``, ``+3`` and ``3`` all map to
    the same key and are grouped together by the shuffle/sort phase.
    """
    keys = []
    for token in line.split():
        try:
            num = int(token)
        except ValueError:
            # Not an integer: this job only cares about numbers.
            continue
        if num % 2 != 0:
            keys.append(str(num))
    return keys


def main():
    """Read text from stdin and write one ``<odd number>\\t1`` line per hit."""
    for line in sys.stdin:
        for key in odd_keys(line):
            sys.stdout.write(key + "\t1\n")


if __name__ == "__main__":
    main()
reducer.py
#!/usr/bin/env python
import sys
def count_distinct(lines):
    """Count the distinct integer keys in an iterable of ``key\\tvalue`` lines.

    The input is expected to be grouped by key (as Hadoop Streaming
    delivers it to a reducer): a key is counted once each time it differs
    from the key on the previous record, so equal keys must be adjacent.

    Blank lines are skipped.  A line without a tab is treated as a key
    with no value.  Records whose key is not an integer are skipped.
    """
    count = 0
    current_num = None
    for line in lines:
        line = line.strip()
        if not line:
            continue
        key = line.split("\t", 1)[0]
        try:
            num = int(key)
        except ValueError:
            # Malformed key: ignore the record instead of aborting the task.
            continue
        if num != current_num:
            count += 1
            current_num = num
    return count


def main():
    """Print the number of distinct keys read from stdin."""
    print(count_distinct(sys.stdin))


if __name__ == "__main__":
    main()
Hadoop Commands
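Use a single reducer so that every key reaches the same reducer and the distinct count it prints is the total for the whole input.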
# Create the HDFS input directory and upload the data file into it.
hadoop fs -mkdir /q4input
hadoop fs -put NumbersAndStrings.txt /q4input
# Run the streaming job with one reduce task, shipping mapper.py and
# reducer.py with the job; `time` reports how long the run takes.
time hadoop jar $HADOOP_HOME/share/hadoop/tools/lib/hadoop-streaming-2.6.4.jar \
-D mapred.reduce.tasks=1 \
-file /home/ec2-user/part4/mapper.py -mapper mapper.py \
-file /home/ec2-user/part4/reducer.py -reducer reducer.py \
-input /q4input -output /q4outputb
# Print the job's output files.
hadoop fs -cat /q4outputb/*
Screenshot