02-bpe
Train BPE on a toy text example
BPE algorithm: https://web.stanford.edu/~jurafsky/slp3/2.pdf (Section 2.4.3)
In [ ]:
import re, collections
text = "The aims for this subject is for students to develop an understanding of the main algorithms used in natural language processing, for use in a diverse range of applications including text classification, machine translation, and question answering. Topics to be covered include part-of-speech tagging, n-gram language modelling, syntactic parsing and deep learning. The programming language used is Python, see for more information on its use in the workshops, assignments and installation at home."
# text = 'low '*5 + 'lower '*2 + 'newest '*6 + 'widest '*3
def get_vocab(text):
    vocab = collections.defaultdict(int)
    for word in text.strip().split():
        # note: we use the special token </w> (instead of underscore in the lecture) to denote the end of a word
        vocab[' '.join(list(word)) + ' </w>'] += 1
    return vocab
"""
This function iterates through all words in the vocabulary and counts the pairs of tokens which are next to each other.

EXAMPLE:
    word = 'T h e </w>'
    pairs of tokens in this word: [('T', 'h'), ('h', 'e'), ('e', '</w>')]

INPUT:
    vocab: Dict[str, int]  # The vocabulary, a counter of word frequencies

OUTPUT:
    pairs: Dict[Tuple[str, str], int]  # Token pairs, a counter of pair frequencies
"""
def get_stats(vocab):
    pairs = collections.defaultdict(int)
    ###
    # Your answer BEGINS HERE
    ###

    ###
    # Your answer ENDS HERE
    ###
    return pairs
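# A minimal sketch of the pair counting described above (one possible approach,
# shown for reference only -- get_stats_sketch is not part of the exercise):
def get_stats_sketch(vocab):
    pairs = collections.defaultdict(int)
    for word, freq in vocab.items():
        symbols = word.split()
        # count each pair of adjacent tokens, weighted by the word's frequency
        for j in range(len(symbols) - 1):
            pairs[(symbols[j], symbols[j + 1])] += freq
    return pairs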
"""
This function merges a given pair of tokens in all words in the vocabulary.

EXAMPLE:
    word = 'T h e </w>'
    pair = ('e', '</w>')
    word_after_merge = 'T h e</w>'

INPUT:
    pair: Tuple[str, str]  # the pair of tokens to be merged
    v_in: Dict[str, int]   # the vocabulary before the merge

OUTPUT:
    v_out: Dict[str, int]  # the vocabulary after the merge

HINT:
    When merging the pair ('h', 'e') in the word 'Th e</w>', the two tokens 'Th' and 'e</w>' shouldn't be merged.
"""
def merge_vocab(pair, v_in):
    v_out = {}
    ###
    # Your answer BEGINS HERE
    ###

    ###
    # Your answer ENDS HERE
    ###
    return v_out
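# A minimal sketch of the merge step described above (one possible approach,
# shown for reference only -- merge_vocab_sketch is not part of the exercise).
# The pair is joined only where its two tokens appear as whole, space-separated
# symbols, so per the hint, ('h', 'e') is not merged inside 'Th e</w>'.
def merge_vocab_sketch(pair, v_in):
    v_out = {}
    bigram = re.escape(' '.join(pair))
    # (?<!\S) and (?!\S) require the pair to be bounded by whitespace or string edges
    pattern = re.compile(r'(?<!\S)' + bigram + r'(?!\S)')
    for word, freq in v_in.items():
        v_out[pattern.sub(''.join(pair), word)] = freq
    return v_out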
def get_tokens(vocab):
    tokens = collections.defaultdict(int)
    for word, freq in vocab.items():
        word_tokens = word.split()
        for token in word_tokens:
            tokens[token] += freq
    return tokens
vocab = get_vocab(text)
print("Vocab =", vocab)
print('==========')
print('Tokens Before BPE')
tokens = get_tokens(vocab)
print('Tokens: {}'.format(tokens))
print('Number of tokens: {}'.format(len(tokens)))
print('==========')
# after about 100 merges we start to see common words
num_merges = 100
for i in range(num_merges):
    pairs = get_stats(vocab)
    if not pairs:
        break
    best = max(pairs, key=pairs.get)
    new_token = ''.join(best)
    vocab = merge_vocab(best, vocab)
    print('Iter: {}'.format(i))
    print('Best pair: {}'.format(best))
    # add the new merged token to the token counter
    tokens[new_token] = pairs[best]
    # deduct the merged pair's frequency from the two tokens it was built from
    tokens[best[0]] -= pairs[best]
    tokens[best[1]] -= pairs[best]
    print('Tokens: {}'.format(tokens))
    print('Number of tokens: {}'.format(len(tokens)))
    print('==========')
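For intuition: on the commented-out toy corpus ('low' ×5, 'lower' ×2, 'newest' ×6, 'widest' ×3), counting adjacent pairs gives ('e', 's'), ('s', 't') and ('t', '</w>') a count of 9 each (6 from 'newest' and 3 from 'widest'), so the first merge will be one of these three pairs, depending on how max() breaks the tie.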
After training, use the BPE dictionaries to tokenise sentences
In [ ]:
def get_tokens_from_vocab(vocab):
    tokens_frequencies = collections.defaultdict(int)
    vocab_tokenization = {}
    for word, freq in vocab.items():
        word_tokens = word.split()
        for token in word_tokens:
            tokens_frequencies[token] += freq
        vocab_tokenization[''.join(word_tokens)] = word_tokens
    return tokens_frequencies, vocab_tokenization
def measure_token_length(token):
    if token[-4:] == '</w>':
        return len(token[:-4]) + 1
    else:
        return len(token)
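# e.g. measure_token_length('est</w>') == 4: the three characters 'e', 's', 't'
# plus one for the end-of-word marker </w>, which is counted as a single symbol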
def tokenize_word(string, sorted_tokens, unknown_token='</u>'):
    if string == '':
        return []
    if sorted_tokens == []:
        return [unknown_token] * len(string)

    string_tokens = []
    # iterate over all tokens (longest / most frequent first) to find a match
    for i in range(len(sorted_tokens)):
        token = sorted_tokens[i]
        token_reg = re.escape(token.replace('.', '[.]'))
        matched_positions = [(m.start(0), m.end(0)) for m in re.finditer(token_reg, string)]
        # if no match is found in the string, go to the next token
        if len(matched_positions) == 0:
            continue
        # the start of each match marks the end of the preceding sub-word
        substring_end_positions = [matched_position[0] for matched_position in matched_positions]
        substring_start_position = 0
        for substring_end_position in substring_end_positions:
            # slice out the sub-word before this match
            substring = string[substring_start_position:substring_end_position]
            # tokenize this sub-word with the remaining tokens
            string_tokens += tokenize_word(string=substring, sorted_tokens=sorted_tokens[i+1:], unknown_token=unknown_token)
            string_tokens += [token]
            substring_start_position = substring_end_position + len(token)
        # tokenize the remaining string after the last match
        remaining_substring = string[substring_start_position:]
        string_tokens += tokenize_word(string=remaining_substring, sorted_tokens=sorted_tokens[i+1:], unknown_token=unknown_token)
        break
    else:
        # return a list of unknown tokens if no token matches the string
        string_tokens = [unknown_token] * len(string)
    return string_tokens
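# Example (with a hypothetical sorted token list, longest tokens first):
#   tokenize_word('lowest</w>', ['est</w>', 'lo', 'w', 'e'])  ->  ['lo', 'w', 'est</w>']
# Each sub-word to the left of a match is tokenised recursively with the remaining
# (shorter or rarer) tokens; characters that match nothing become the unknown token.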
"""
This function generates a list of all tokens sorted by their length (1st key) and frequency (2nd key), both in descending order.

EXAMPLE:
    token frequency dictionary before sorting: {'natural': 3, 'language': 2, 'processing': 4, 'lecture': 4}
    sorted tokens: ['processing', 'language', 'lecture', 'natural']

INPUT:
    tokens_frequencies: Dict[str, int]  # Counter for token frequency

OUTPUT:
    sorted_tokens: List[str]  # Tokens sorted by length and frequency
"""
def sort_tokens(tokens_frequencies):
    ###
    # Your answer BEGINS HERE
    ###

    ###
    # Your answer ENDS HERE
    ###
    return sorted_tokens
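# A minimal sketch of the sorting described above (one possible approach, shown
# for reference only -- sort_tokens_sketch is not part of the exercise):
def sort_tokens_sketch(tokens_frequencies):
    # sort by token length first, then frequency, both descending
    sorted_tokens_tuple = sorted(tokens_frequencies.items(),
                                 key=lambda item: (measure_token_length(item[0]), item[1]),
                                 reverse=True)
    return [token for (token, freq) in sorted_tokens_tuple]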
# display the vocab
tokens_frequencies, vocab_tokenization = get_tokens_from_vocab(vocab)

# sort tokens by length and frequency
sorted_tokens = sort_tokens(tokens_frequencies)
print("Tokens =", sorted_tokens, "\n")

sentence_1 = 'I like natural language processing!'
sentence_2 = 'I like natural languaaage processing!'
sentence_list = [sentence_1, sentence_2]
for sentence in sentence_list:
    print('==========')
    print("Sentence =", sentence)
    for word in sentence.split():
        word = word + "</w>"
        print('Tokenizing word: {}...'.format(word))
        if word in vocab_tokenization:
            print(vocab_tokenization[word])
        else:
            print(tokenize_word(string=word, sorted_tokens=sorted_tokens, unknown_token='</u>'))