task_2
FIT5196 Assessment 1¶
Student Name:¶
Student ID:¶
Date: 02/04/2017
Version: 2.0
Environment: Python 3.6.0 and Anaconda 4.3.0 (64-bit)
Libraries used:
collections (for calculating word frequencies)
re 2.2.1 (for regular expressions)
os (for joining paths, splitting file names, and checking whether files exist)
1. Introduction¶
This task builds sparse representations for the meeting transcripts generated in task 1, which includes word tokenization, vocabulary generation, and the generation of sparse representations.
2. Import libraries¶
In [2]:
from collections import defaultdict, Counter
from os import listdir
from os.path import isfile, join, split, exists, splitext
import re
3. Word tokenization, vocabulary generation¶
Construct the stop word list.
Loop over the txt files generated by task 1, tokenize each one (see the short tokenization example below), remove stop words, and collect all words.
Calculate word frequencies and remove words whose frequency exceeds 132.
Sort the vocabulary in alphabetical order, add an index to each word, and save it to a file.
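Before building the vocabulary, it may help to see what the regular expression used for tokenization returns. The sentence below is a made-up example (not taken from the transcripts), and the stop word removal shown in the next cell is still applied afterwards.
In [ ]:
# Illustration only: tokenize a hypothetical sentence with the same pattern used below
import re
sample = "The project manager's re-design of the remote control"
print(re.findall(r"\w+(?:[-']\w+)?", sample.lower()))
# ['the', 'project', "manager's", 're-design', 'of', 'the', 'remote', 'control']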
In [2]:
# Read stop words from ./stopwords_en.txt to construct the stop word list
stopwords_file = './stopwords_en.txt'
f = open(stopwords_file)
line = f.readline()
stopwords = []
while line:
    stopwords.append(line.strip())
    line = f.readline()
f.close()
# Define a word segmentation function: take a piece of text, tokenize it with a regular expression, and remove stop words
def extract_tokens(document, stopwords):
    # Tokenize with a regular expression, converting the text to lowercase first
    words = re.findall(r"\w+(?:[-']\w+)?", document.lower())
    # remove words that appear in the stop word list
    return [word for word in words if word not in stopwords]
# Read a txt file generated by task 1 and return a list with each sentence as an element
def load_txt(txt_file):
    f = open(txt_file)
    line = f.readline()
    sentence_list = []
    while line:
        # skip the asterisk separator lines
        if line.strip() != '**********':
            sentence_list.append(line.strip())
        line = f.readline()
    f.close()
    return sentence_list
# Call load_txt on multiple txt files and merge their sentences into a single list
def batch_load_txt(onlyfiles):
    meeting_transcript_list = []
    for txt_file in onlyfiles:
        # call load_txt to read one txt file
        meeting_transcript = load_txt(txt_file)
        # flatten by extend
        meeting_transcript_list.extend(meeting_transcript)
    return meeting_transcript_list
# Input a list of paragraphs and tokenize each paragraph in the list
def generate_segment_tokens_list(meeting_transcript_list):
    # Tokenize each paragraph; the words of a paragraph are kept as a sub-list
    meeting_transcript_list_tokens = list(map(lambda x: extract_tokens(x, stopwords), meeting_transcript_list))
    # initialize a dict with a default value of 0
    frequency = defaultdict(int)
    # loop over each sub-list in meeting_transcript_list_tokens
    for meeting_transcript_tokens in meeting_transcript_list_tokens:
        for token in meeting_transcript_tokens:
            # count the frequency of each word
            frequency[token] += 1
    # Use the frequency counts to remove words with a frequency greater than 132 from meeting_transcript_list_tokens
    tokens_list = [[token for token in meeting_transcript_tokens if frequency[token] <= 132] for meeting_transcript_tokens in meeting_transcript_list_tokens]
    # return the filtered result
    return tokens_list
# Use the list of tokenized paragraphs to generate a vocabulary dict: key is a word, value is its index
# Words in the vocabulary are sorted in alphabetical order
# The tokens_list input is the output of generate_segment_tokens_list
def generate_token_dict(tokens_list):
    tokens_set = []
    # Collect the words from every sub-list into one large list
    [tokens_set.extend(tokens) for tokens in tokens_list]
    # Drop duplicate words and sort
    distinct_sorted_token = sorted(set(tokens_set))
    # Create the index of each word
    token_idx = range(0, len(distinct_sorted_token))
    # Pair each word with its index: the key is the word, the value is the index
    sorted_token_set = zip(distinct_sorted_token, token_idx)
    # Put the key-value pairs into a dict
    token_dict = dict(sorted_token_set)
    return token_dict
# Save each word and its index to the specified file
# Input a dict whose keys are vocabulary words and whose values are indexes, plus the output file path
def output_vocab(vocab_dict, output_file):
    # Turn vocab_dict into a list and sort it
    vocab_list = [(word, idx) for word, idx in vocab_dict.items()]
    vocab_list_sorted = sorted(vocab_list, key=lambda x: x[0])
    f = open(output_file, 'w')
    for word, idx in vocab_list_sorted:
        f.write("%s:%s\n" % (word, idx))
    f.close()
In [ ]:
txt_files_dir = './txt_files'
onlyfiles = [join(txt_files_dir, f) for f in listdir(txt_files_dir) if (isfile(join(txt_files_dir, f))) and (splitext(f)[1] == '.txt')]
meeting_transcript_list = batch_load_txt(onlyfiles)
segment_tokens_list = generate_segment_tokens_list(meeting_transcript_list)
token_dict = generate_token_dict(segment_tokens_list)
vocab_output_file = './vocab.txt'
output_vocab(token_dict, vocab_output_file)
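As an optional sanity check (illustrative only), the vocabulary size and the first few word:index lines written to vocab.txt can be inspected:
In [ ]:
# Illustration only: inspect the vocabulary that was just written
print(len(token_dict))
with open(vocab_output_file) as vf:
    for _ in range(5):
        print(vf.readline().strip())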
4. Generate the topic boundaries encoded¶
Loop through each txt file generated by task 1 and do the following:
1) Count the number of lines in the file
2) Record the positions of the separators
3) Generate a zero vector whose length equals the number of lines, and set the element just before each separator position to 1 to mark a topic boundary (a toy example is given below)
4) Convert the vector to a string and write it to the output file
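The toy example below (hypothetical lines, not taken from the transcripts) shows the encoding logic that the next cell applies to the real files: the line just before each separator receives a 1.
In [ ]:
# Illustration only: encode boundaries for a hypothetical 5-line file with two separators
toy_lines = ['a', 'b', '**********', 'c', 'd', '**********', 'e']
count, boundaries = 0, []
for line in toy_lines:
    if line != '**********':
        count += 1
    else:
        boundaries.append(count)
vec = [0] * count
for i in boundaries:
    vec[i - 1] = 1
print(",".join(map(str, vec)))
# 0,1,0,1,0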
In [3]:
# Input a single txt file, output its topic boundaries encoded as a boolean vector
def generate_topic_seg(topic_txt_file):
    f = open(topic_txt_file, 'r')
    line = f.readline()
    count = 0
    boundaries = []
    # Loop over every line of the txt file
    while line:
        # Asterisk separators do not count towards the number of lines
        if line.strip() != "**********":
            count += 1
        else:
            # An asterisk line marks a topic boundary,
            # so record the current position in a list
            boundaries.append(count)
        line = f.readline()
    f.close()
    # Create a list of zeros with a length equal to the number of lines
    zero_list = [0] * count
    # Set the element at each boundary position to 1
    for i in boundaries:
        zero_list[i-1] = 1
    # Join the binary vector with commas
    topic_seg = ",".join(map(str, zero_list))
    meeting_transcript = split(topic_txt_file)[-1].replace(".txt", "")
    # Prefix the file ID and return the line to be written to the output file
    return "%s:%s" % (meeting_transcript, topic_seg)
# Loop through each file to get all the vectors
def batch_generate_topic_seg(onlyfiles):
    topic_seg_list = []
    for txt_file in onlyfiles:
        topic_seg_list.append(generate_topic_seg(txt_file))
    return topic_seg_list
# Save each file's corresponding vector into the topic_segs.txt file
def output_topic_seg(topic_seg_list, output_file):
    f = open(output_file, 'w')
    for topic_seg in topic_seg_list:
        f.write("%s\n" % topic_seg)
    f.close()
In [ ]:
topic_seg_output_file = './topic_segs.txt'
topic_seg_list = batch_generate_topic_seg(onlyfiles)
output_topic_seg(topic_seg_list, topic_seg_output_file)
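As an optional check (illustrative only), the first encoded line written to topic_segs.txt can be printed:
In [ ]:
# Illustration only: show the first file's boundary encoding
with open(topic_seg_output_file) as tf:
    print(tf.readline().strip())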
5. Transform paragraphs to sparse representation¶
Loop through each txt file generated by task 1 and do the following:
1) Split the content of the txt file into paragraphs
2) Tokenize each paragraph
3) Count the frequency of each word in the paragraph and generate index:frequency pairs (see the toy example below)
4) Join all pairs of a paragraph with commas and write them to the result file
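The toy example below (hypothetical vocabulary and paragraph, not taken from the data) shows the index:frequency encoding that the functions in the next cell produce; the order of the pairs follows the order in which the words are first seen.
In [ ]:
# Illustration only: encode one hypothetical paragraph against a tiny vocabulary
toy_vocab = {'button': 0, 'design': 1, 'remote': 2}
toy_tokens = ['remote', 'button', 'remote', 'design']
toy_index = [toy_vocab[t] for t in toy_tokens if t in toy_vocab]
print(",".join("%s:%s" % (i, c) for i, c in Counter(toy_index).items()))
# 2:2,0:1,1:1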
In [4]:
# Input a txt file and the vocabulary dict generated in the first step, and generate a sparse vector for each paragraph in the txt file
# Each entry uses the word index as key and the word's frequency in the paragraph as value
def sparse_txt_file(txt_file, vocab_dict):
    f = open(txt_file, 'r')
    # Read all lines so the content can be split on the asterisk separator
    line = f.readline()
    lines = []
    while line:
        lines.append(line.strip())
        line = f.readline()
    f.close()
    # drop the last line of the file
    lines.pop(-1)
    paragraph_list = " ".join(lines).strip().split("**********")
    # Tokenize each paragraph and remove stop words
    paragraph_list_tokens = map(lambda x: extract_tokens(x, stopwords), paragraph_list)
    # Convert the words of each paragraph into a string of the form index:value,...
    paragraph_sparse_rep = map(lambda x: trans_tokens_to_index(x, vocab_dict), paragraph_list_tokens)
    return paragraph_sparse_rep
# Use the vocabulary dict to transform each word into its index, and at the same time count the frequency
# with which each word occurs in the corresponding paragraph.
def trans_tokens_to_index(tokens_list, vocab_dict):
    # Convert only words that exist in the vocabulary dict
    tokens_index_list = [vocab_dict.get(token) for token in tokens_list if token in vocab_dict]
    # Count how often each word appears in the paragraph
    counts = Counter(tokens_index_list)
    # word index as key, frequency as value, producing a list of index:value strings
    sparse_vec = ["%s:%s" % (token, freq) for token, freq in counts.items()]
    return ",".join(sparse_vec)
# Write each paragraph's corresponding sparse vector into the result file
def output_sparse_txt(paragraph_sparse_rep, output_file):
    f = open(output_file, 'w')
    for sparse_rep in paragraph_sparse_rep:
        f.write("%s\n" % sparse_rep)
    f.close()
def batch_output_sparse_txt(txt_file_list, token_dict, output_dir):
    # Loop through each txt file
    for topic_txt_file in txt_file_list:
        # Output file name
        sparse_rep_output_file = join(output_dir, split(topic_txt_file)[-1])
        # Call sparse_txt_file to extract the sparse vectors of one txt file
        paragraph_sparse_rep = sparse_txt_file(topic_txt_file, token_dict)
        # Write the sparse vectors into the result file
        output_sparse_txt(paragraph_sparse_rep, sparse_rep_output_file)
In [7]:
sparse_rep_output_dir = './sparse_files'
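# Added guard (an assumption: it is acceptable to create the output directory here if it is missing);
# batch_output_sparse_txt writes one file per transcript into sparse_rep_output_dir, which must exist.
import os
if not exists(sparse_rep_output_dir):
    os.makedirs(sparse_rep_output_dir)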
batch_output_sparse_txt(onlyfiles, token_dict, sparse_rep_output_dir)
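As an optional check (illustrative only), the first sparse line of the first output file can be read back:
In [ ]:
# Illustration only: read back one of the sparse representation files just written
sample_out = join(sparse_rep_output_dir, split(onlyfiles[0])[-1])
with open(sample_out) as sf:
    print(sf.readline().strip())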