# -*- coding: UTF-8 -*-
"""
Universität Tübingen – Seminar für Sprachwissenschaft
VL 'Programming and Data Analysis' WS 2019-2020
© Johannes Dellert, Gerhard Jäger
Assignment 07: Analyzing the Spanish Copulas
Template
"""

# all inflected forms of 'ser'
conj_ser = {"ser", "siendo", "sido", "sida", "sidos", "sidas", "soy", "eres", "sos", "es", "somos", "sois", "son",
            "era", "eras", "era", "éramos", "erais", "eran", "fui", "fuiste", "fue", "fuimos", "fuisteis", "fueron",
            "seré", "serás", "será", "seremos", "seréis", "serán", "sería", "serías", "sería", "seríamos", "seríais",
            "serían", "sea", "seas", "sea", "seamos", "seáis", "sean", "fuera", "fueras", "fuera", "fuéramos",
            "fuerais", "fueran", "fuese", "fueses", "fuese", "fuésemos", "fueseis", "fuesen", "fuere", "fueres",
            "fuere", "fuéremos", "fuereis", "fueren", "sé", "sea", "seamos", "sed", "sean"}
# all inflected forms of 'estar'
conj_estar = {"estar", "estando", "estado", "estada", "estados", "estadas", "estoy", "estás", "está", "estamos",
              "estáis", "están", "estaba", "estabas", "estaba", "estábamos", "estabais", "estaban", "estuve",
              "estuviste", "estuvo", "estuvimos", "estuvisteis", "estuvieron", "estaré", "estarás", "estará",
              "estaremos", "estaréis", "estarán", "estaría", "estarías", "estaría", "estaríamos", "estaríais",
              "estarían", "esté", "estés", "esté", "estemos", "estéis", "estén", "estuviera", "estuvieras",
              "estuviera", "estuviéramos", "estuvierais", "estuvieran", "estuviese", "estuvieses", "estuviese",
              "estuviésemos", "estuvieseis", "estuviesen", "estuviere", "estuvieres", "estuviere", "estuviéremos",
              "estuviereis", "estuvieren", "está", "esté", "estemos", "estad", "estén"}


# Task 1
def load_sentences(filename):
    """Load sentences with POS tags from a file.

    :param filename: name of the file with the sentences, one sentence per line
    :type filename: str
    :return: list of the sentences from the file, each represented by a list of (form, pos) tuples
    :rtype: list[list[tuple[str, str]]]
    """
    with open(filename, encoding="utf-8") as f:  # ensure the file gets closed even if an exception occurs
        sentences_list = []  # list of sentences to be returned
        for line in f:  # iterate over the file line by line
            tokens = line.strip().split(" ")  # strip the newline symbol and split at spaces to get the tagged tokens
            # split each token further into word form and tag; a tuple is the expected output type
            words = [tuple(item.split("_")) for item in tokens]
            sentences_list.append(words)
        return sentences_list
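
# A quick illustration of the expected I/O (the "form_POS" token format is assumed
# from the parsing logic above; the example sentence is hypothetical):
#   input line:   "la_DET casa_NOUN es_AUX grande_ADJ"
#   output item:  [("la", "DET"), ("casa", "NOUN"), ("es", "AUX"), ("grande", "ADJ")]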


# Task 2
def lemmatize(adj):
    """Naively lemmatize Spanish adjectives.

    :param adj: the adjective to be lemmatized
    :type adj: str
    :return: the lemma form of the adjective
    :rtype: str
    """
    if adj.endswith("esa"):
        return adj[:-3] + "és"
    elif adj.endswith("a"):
        return adj[:-1] + "o"
    elif adj.endswith("esas") or adj.endswith("eses"):
        return adj[:-4] + "és"
    elif adj.endswith("as") or adj.endswith("os"):
        return adj[:-2] + "o"
    elif adj.endswith("ntes") or adj.endswith("nses") or adj.endswith("bles") or adj.endswith("bres"):
        return adj[:-1]
    elif adj.endswith("les") or adj.endswith("res") or adj.endswith("nes"):
        return adj[:-2]
    elif adj.endswith("ces"):
        return adj[:-3] + "z"
    elif adj.endswith("es"):  # the more specific -es cases above must be checked first so this branch doesn't shadow them
        return adj[:-1]
    else:
        return adj
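
# A few illustrative calls (hypothetical test inputs, not drawn from the corpus):
#   lemmatize("roja")      -> "rojo"     (feminine singular -a -> masculine -o)
#   lemmatize("francesa")  -> "francés"  (-esa -> -és)
#   lemmatize("felices")   -> "feliz"    (plural -ces -> -z)
#   lemmatize("naturales") -> "natural"  (plural -les -> drop -es)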


# Task 3
def count_occurrences(sentences):
    """Count occurrences of adjectives as complements to forms of "ser" and "estar".

    :param sentences: a list of sentences, each represented by a list of (form, pos) tuples
    :type sentences: list[list[tuple[str, str]]]
    :return: two dictionaries storing the counts of adjective lemmas following forms of ser and estar in the sentences
    :rtype: tuple[dict[str, int], dict[str, int]]
    """
    # create the dictionaries to return
    freq_ser = dict()
    freq_estar = dict()
    for sentence in sentences:  # go through all sentences
        # enumerate gives the position directly; sentence.index(word) would return the first
        # occurrence of a repeated token (and index -1 would wrap around on the first token)
        for i, word in enumerate(sentence):
            if i > 0 and len(word) == 2 and word[1] == "ADJ":  # a well-formed (form, pos) tuple tagged as adjective
                aux = sentence[i - 1]  # look at the token directly before the adjective
                if len(aux) == 2 and aux[1] == "AUX":  # the preceding token must be a well-formed auxiliary
                    lemma = lemmatize(word[0])  # reduce the adjective to its lemma
                    if lemma not in freq_ser:
                        freq_ser[lemma] = 0  # if the lemma isn't in the dict yet, initialize its count to 0
                    if aux[0] in conj_ser:  # count the occurrence if the auxiliary is a form of "ser"
                        freq_ser[lemma] += 1
                    if lemma not in freq_estar:
                        freq_estar[lemma] = 0
                    if aux[0] in conj_estar:  # count the occurrence if the auxiliary is a form of "estar"
                        freq_estar[lemma] += 1
    return freq_ser, freq_estar
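
# Minimal sketch of the counting behaviour on a single hypothetical sentence:
#   sents = [[("la", "DET"), ("casa", "NOUN"), ("es", "AUX"), ("grande", "ADJ")]]
#   count_occurrences(sents) -> ({"grande": 1}, {"grande": 0})
# ("es" is a form of "ser", so only the ser count is incremented.)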


# Task 4
def get_occurrence_sets(freq_ser, freq_estar):
    """Extract a partition of adjectives by well-attested co-occurrence with "ser" and "estar".

    :param freq_ser: a dictionary mapping
        adjective lemmas to the number of times each adjective occurred after forms of "ser"
    :type freq_ser: dict[str, int]
    :param freq_estar: a dictionary mapping
        adjective lemmas to the number of times each adjective occurred after forms of "estar"
    :type freq_estar: dict[str, int]
    :return: tuple of three sets (ser, estar, both)
        partitioning the adjectives with frequency >= 10 into the copulas they are attested with more than once
    :rtype: tuple[set[str], set[str], set[str]]
    """
    ser_adj = set()
    estar_adj = set()
    both_adj = set()
    for item in freq_ser:  # both dictionaries share the same keys, so iterating over one suffices
        if freq_ser[item] + freq_estar[item] >= 10:  # only consider well-attested adjectives
            if freq_ser[item] > 1 and freq_estar[item] < 2:
                ser_adj.add(item)
            if freq_estar[item] > 1 and freq_ser[item] < 2:
                estar_adj.add(item)
            if freq_ser[item] > 1 and freq_estar[item] > 1:
                both_adj.add(item)
    return (ser_adj, estar_adj, both_adj)
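
# Illustration with made-up counts (>= 10 total occurrences are required, and a copula
# counts as attested for a lemma when it occurred with it more than once):
#   freq_ser   = {"grande": 12, "contento": 1,  "feliz": 6}
#   freq_estar = {"grande": 0,  "contento": 11, "feliz": 6}
#   get_occurrence_sets(freq_ser, freq_estar) -> ({"grande"}, {"contento"}, {"feliz"})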


# Task 5 (Bonus)
def search_bibliography(target, contents):
    """Search a LaTeX bibliography for targets using regular expressions.

    :param target: the target to search for: titles, authors, years, pages, colltitles,
        or the transformation to perform: transform_headings, transform_names
    :type target: str
    :param contents: the contents of the bibliography
    :type contents: str
    :return: a list of matching substrings
    :rtype: list[str]
    """
    import re  # the standard library module suffices for these patterns
    if not target.startswith("transform_"):
        pattern = ""
        if target == "titles":
            # short version (w/o comments):
            # pattern = r"\)\.\s([\w\s+:,;-]+)\."
            # long version (w comments):
            pattern = r"""
            \)\.\s          # paper title begins after '). ' of the year information
            (               # beginning of the one group we're interested in
            [\w\s+:,;-]+    # title is a concat. of word characters, whitespace and some punct. symbols
            )               # end of the one group we're interested in
            \.              # paper title ends before '.'
            """
        elif target == "authors":
            # short version (w/o comments):
            # pattern = r"\\bibitem\{\w*\}\s(.+)\s\("
            # long version (w comments):
            pattern = r"""
            \\bibitem\{\w*\}\s  # author begins after '\bibitem{key} ' (where 'key' is a concat. of word ch.)
            (                   # beginning of the one group we're interested in
            .+                  # author is a concatenation of any symbols, at least one
            )                   # end of the one group we're interested in
            \s\(                # author ends before ' (' of the year information
            """
        elif target == "years":
            pattern = r"\d{4}"  # a year is four consecutive digits
        elif target == "pages":
            pattern = r"\d+--\d+"  # a page range is two numbers separated by a LaTeX en-dash '--'
        elif target == "colltitles":
            pattern = r"In\s\{\\it\s(.*)\}"  # a collection title is set in italics after 'In'
        return re.findall(pattern, contents, re.X)
    else:
        pattern = ""
        substitute = ""
        if target == "transform_headings":
            # short version (w/o comments):
            pattern = r"(\\bibitem\{\w+\})"
            # long version (w comments):
            # pattern = r"""
            # (                 # beginning of the 1st group (= heading)
            # \\bibitem\{\w+\}  # a bibitem heading is '\bibitem{key}' (where 'key' is a concat. of word ch.s)
            # )                 # end of the 1st group (= heading)
            # """
            substitute = r"\n\1\n"  # insert a newline, then the 1st group (= the heading), then another newline
        elif target == "transform_names":
            pattern = r"(\w+),(\s)([A-Z]\.)"  # 'Lastname, I.' as three groups: name, whitespace, initial
            substitute = r"\3\2\1"  # swap the groups to get 'I. Lastname'
        # flags must be passed by keyword here: the fourth positional argument of re.sub is 'count'
        return re.sub(pattern, substitute, contents, flags=re.X)
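
# Illustration on a hypothetical bibliography entry (the format is inferred from the
# patterns above, not taken from the actual bibliography.tex):
#   entry = r"\bibitem{doe99} Doe, J. (1999). A study of copulas. In {\it Proceedings}. 12--34."
#   search_bibliography("authors", entry)    -> ["Doe, J."]
#   search_bibliography("years", entry)      -> ["1999"]
#   search_bibliography("pages", entry)      -> ["12--34"]
#   search_bibliography("colltitles", entry) -> ["Proceedings"]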


if __name__ == '__main__':
    sentences = load_sentences("spanish_tagged_spacy.txt")  # load the tagged sentence corpus
    freq_ser, freq_estar = count_occurrences(sentences)  # count the occurrences of adjectives with each copula
    ser, estar, both = get_occurrence_sets(freq_ser, freq_estar)  # get the co-occurrence sets of adjs. for each copula
    print("ser:\n====\n" + "\n".join(sorted(ser)) + "\n")  # print the adjectives that only go with "ser"
    print("estar:\n====\n" + "\n".join(sorted(estar)) + "\n")  # print the adjectives that only go with "estar"
    print("both:\n====\n" + "\n".join(sorted(both)) + "\n")  # print the adjectives that can go with both copulas

    # Bonus task
    # with open("bibliography.tex") as bib_file:
    #     file_contents = bib_file.read()
    # print(search_bibliography("titles", file_contents))
    # print(search_bibliography("authors", file_contents))
    # print(search_bibliography("years", file_contents))
    # print(search_bibliography("pages", file_contents))
    # print(search_bibliography("colltitles", file_contents))
    # print(search_bibliography("transform_headings", file_contents))
    # print(search_bibliography("transform_names", file_contents))