;; The first three lines of this file were inserted by DrRacket. They record metadata
;; about the language level of this file in a form that our tools can easily process.
#reader(lib “2017-fall-reader.rkt” “csc104”)((modname authorship) (compthink-settings #hash((prefix-types? . #f))))
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;; In this project you will practice manipulating lists by measuring some
;;; statistics from lists of words. At the very end of the file we provide
;;; some *loooong* lists of words (books) to experiment with, in order to
;;; see whether the statistics can distinguish between works by
;;; Jane Austen, Charles Dodgeson, Charles Dickens, and the brothers Grimm.
;;;
;;; Your job is to complete the code below. Everywhere you find XXX, there
;;; is either a check-expect expression or a function to implement or fix
;;; up.
;;;
;;; Work from top to bottom. Get one thing working before you move to the
;;; next. One good technique is to use the expression comment to disable
;;; the check-expects you are not yet ready to deal with, for example:
;;;
#;(check-expect (f x) 17)
;;;
;;; Now you can click “Run” without seeing the output from many, many
;;; check-expects.
;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;; Some useful constant definitions…
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; punctuation is a string containing punctuation we
; expect to encounter
(define punctuation “!\”‘,;:.-?)([]<>*#\n\t\r”)
; end-of-sentence punctuation is a string containing
; punctuation we would expect at the end of a sentence
(define terminal-punctuation “.!?”)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;; Some useful definitions for string manipulation…
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; chop-last-character : string -> string
; Produce s with last character removed, assuming s has
; at least one character.
(check-expect (chop-last-character “0”) “”)
(check-expect (chop-last-character “01”) “0”)
(check-expect (chop-last-character “five”)
(substring “five” 0
(- (string-length “five”)
1)))
;;; XXX use the check-expect above to fix chop-last-character
;;; definition below
(define (chop-last-character s)
“”)
; last-character : string -> string
; Return the last character of s. Assume that s
; is at least 1 character long
(check-expect (last-character “{anf..!”) “!”)
;;; XXX write a full-design check-expect for (last-character “012”)
;;; that uses substring and string-length to produce the
;;; last character of “012”
;;; XXX use your check-expect (above) as a guide to fixing
;;; the body of the definition of last-character
(define (last-character s)
“”)
; strip-trailing-punctuation : string -> string
; Produce a new string with all of trailing punction
; removed from s.
(check-expect (strip-trailing-punctuation “one-man-band..!”) “one-man-band”)
(define (strip-trailing-punctuation s)
(cond
;;; XXX question/answer pair for s being an empty string
[#true “empty”]
;;; XXX question/answer pair if the last character of s is punctuation
[#true “!#”]
[else “”]))
; strip-leading-punctuation : string -> string
; Produce a new string with all leading punctuation removed from s.
(check-expect (strip-leading-punctuation “[->#one-man”) “one-man”)
;;; XXX imitate strip-trailing-punctuation (above) to fix
;;; strip-leading-punctuation (below)
(define (strip-leading-punctuation s)
(cond
[#true “”]
[#true “”]
[else “”]))
; non-empty-string? : string -> bool
; Return #true if s is non-empty, #false otherwise
(check-expect (non-empty-string? “”) #false)
(check-expect (non-empty-string? ” “) #true)
;;; XXX produce a full-design check-expect for (non-empty-string? “1”)
;;; by using an expression that compares the length of “1” to 0
;;; Then fix function non-empty-string? (below)
(define (non-empty-string? s)
#false)
; sanitize : string -> string
; Produce a new string from s with all leading and trailing punctuation
; stripped and all upper-case characters in lower case;
(check-expect (sanitize “[->#oNe:Man-BAnd<-]”) “one:man-band”)
;;; Here’s a full-design check-expect for sanitize
(check-expect (sanitize “-AbC:dE-“)
(strip-leading-punctuation
(strip-trailing-punctuation
(string-lower-case “-AbC:dE-“))))
;;; XXX Use the check-expect (above) to fix the definition of
;;; sanitize (below)
(define (sanitize s)
“”)
; unit-terminator? string string -> boolean
; Return whether s terminates a textual unit
; based on whether it contains any of the terminal punctuation
; in t.
(check-expect (unit-terminator? “the end” “.!?”) #false)
(check-expect (unit-terminator? “fin!” “.!?”) #true)
(define (unit-terminator? s t)
(cond
[(zero? (string-length s)) #false]
;;; XXX else/answer pair using lists-intersect? on s and t
;;; turned into lists of length-1 strings, using string->list
[else “false”]))
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;; some useful functions for list, list-of-string,
;;; and list-of-list-of-string
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; lists-intersect? : list list -> boolean
; Does list-1 have any elements in common with list-2?
(check-expect (lists-intersect? (list) (list 5)) #false)
(check-expect (lists-intersect? (list 1 5) (list 5 6)) #true)
(define (lists-intersect? list-1 list-2)
(cond
;;; XXX question/answer for empty list-1
[#true #false]
;;; XXX else either the first element of list-1 is member? list-2 or
;;; the rest of list-1 intersects list-2
[else #false]))
; unit+word : list-of-string string -> list-of-string
; Produce new list-of-string by appending word to unit.
(check-expect (unit+word (list “my” “dog”) “hAs.!”)
(list “my” “dog” “hAs.!”))
(check-expect (unit+word (list “one” “two”) “three”)
(append (list “one” “two”) (list “three”)))
;;; XXX use the full-design check-expect above to guide you
;;; in fixing unit+word below
(define (unit+word unit word)
(list “”))
; unit-list+unit : list-of-list-of-string list-of-string
; -> list-of-list-of-string
; Produce a new units, by appending new-unit.
(check-expect (unit-list+unit
(list (list “what” “is”) (list “up” “now”))
(list “doc” “”))
(append (list (list “what” “is”) (list “up” “now”)) (list (list “doc” “”))))
;;; XXX use the full-design check-expect above to guide you in fixing
;;; unit-list+unit below
(define (unit-list+unit units new-unit)
(list (list “”)))
; word-list->text-unit-list :
; list-of-string list-of-string list-of-list-of-string string
; -> list-of-list-of-string
; Produces units, a list of text units, from words, a list of
; words, and next-unit containing the next unit to add to units,
; where terminators contains the punctuation that terminates a unit.
(check-expect (word-list->text-unit-list
(list “My” “dOg,” “haS” “.” “fleas!” “Okay”)
(list) (list) “.?!”)
(list (list “My” “dOg,” “haS” “.”) (list “fleas!”) (list “Okay”)))
(define (word-list->text-unit-list words next-unit units terminators)
(cond
; no more words, next-unit already added
;;; XXX question/answer if both next-unit and words are empty produce units
[#true (list (list “”))]
; no more words, next unit terminated by running out of words
;;; XXX question/answer if just words is empty add next-unit to units
[#true (list (list “”))]
; next unit terminated by word containing terminal punctuation
[(unit-terminator? (first words) terminators)
(word-list->text-unit-list
(rest words)
(list)
(unit-list+unit units (unit+word next-unit (first words))) terminators)]
; next sentence is still being built
[else
(word-list->text-unit-list
(rest words) (unit+word next-unit (first words)) units terminators)]))
; word-roster : list-of-string -> list-of-string
; Return a sorted list of unique words in words
(check-expect (word-roster (list “my” “dOg” “has” “fleas” “he” “has”) (list))
(list “dog” “fleas” “has” “he” “my”))
(define (word-roster words roster)
(cond
[(empty? words) (sort roster string<?)]
[else
(word-roster
(rest words)
(cond
[(member? (sanitize (first words)) roster) roster]
[else (list* (sanitize (first words)) roster)]))]))
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;; statistics on lists of words
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; average-word-length : list-of-string -> natural
; Return the average length of standardized words in word-list
(check-expect (average-word-length (list “[!one” “two!!” “three…?”)) 11/3)
(define (average-word-length word-list)
(local
[(define sanitized-word-list
(filter non-empty-string? (map sanitize word-list)))]
(cond
[(zero? (length word-list)) 0]
[else
(/ (apply + (map string-length sanitized-word-list))
(length sanitized-word-list))])))
; average-sentence-length : list-of-string -> number
; Return the average number of words in each sentence of words.
(check-expect
(average-sentence-length (list “My” “dOg,” “haS” “.” “fleas!\”]” “Okay”))
5/3)
(define (average-sentence-length words)
(local [(define sentence-list (word-list->text-unit-list words (list) (list) “.!?”))
(define word-list (filter non-empty-string? (map sanitize words)))]
;;; XXX divide the length of word list by length of sentence-list
0))
; type-token-ratio : list-of-string -> number
; Return the ratio of unique words to total words
(check-expect (type-token-ratio
(list “my” “dog” “has” “fleas” “he” “has”))
5/6)
(define (type-token-ratio words)
;;; XXX divide length of word-roster of words by the number of sanitized
;;; words in words
(/ (length (word-roster words (list)))
0))
; hapax-legomena : list-of-string -> list-of-string
; Produce a list of words that occur exactly once
; from word-list. Assume word-list is sanitized and
; non-empty.
(check-expect (hapax-legomena
(list “one” “two” “one” “three” “one”) (list) (list))
(list “three” “two”))
(define (hapax-legomena word-list seen-once seen-twice)
(cond
[(empty? word-list)
(local [(define (not-twice w) (not (member? w seen-twice)))]
(filter not-twice seen-once))]
[(member? (first word-list) seen-once)
(hapax-legomena
(rest word-list) seen-once (list* (first word-list) seen-twice))]
[else
(hapax-legomena
(rest word-list) (list* (first word-list) seen-once) seen-twice)]))
; hapax-legomena-ratio : list-of-string -> number
; Return the ratio of the number of words that occur exactly once
; over the number of distinct words in word-list
(check-expect (hapax-legomena-ratio (list “one” “two” “one” “three” “one”))
2/3)
(define (hapax-legomena-ratio word-list)
;;; XXX divide the length of hapax-legoma of sanitized word list with all “” removed
;;; by the length of the word-roster of word-list
0)
; average-sentence-complexity : list-of-string -> number
; Return the average number of clauses per sentence in word-list
(check-expect (average-sentence-complexity
(list “The” “time” “has” “come,” “the”
“Walrus” “said” “To” “talk” “of”
“many” “things:” “of” “shoes” “-”
“and” “ships” “-” “and” “sealing”
“wax,” “Of” “cabbages;” “and kings.”
“And” “why” “the” “sea” “is” “boiling”
“hot;” “and” “whether” “pigs” “have”
“wings”))
3.5)
(define (average-sentence-complexity words)
(local
[(define sentences (word-list->text-unit-list words (list) (list) “.!?”))
(define (clausify s) (word-list->text-unit-list s (list) (list) “;,:”))
(define clauses (map clausify sentences))]
;;; XXX divide the total length of clauses by the length of sentences
0))
; text-signature : list-of-string -> list-of-number
; Produce a list of average-word-length,
; average-sentence-length, type-token-ratio,
; hapax-legomena-ratio, and average-sentence-complexity
; from words
(check-expect (text-signature
(list “The” “time” “has” “come,” “the”
“Walrus” “said” “To” “talk” “of”
“many” “things:” “of” “shoes” “-”
“and” “ships” “-” “and” “sealing”
“wax,” “Of” “cabbages;” “and kings.”
“And” “why” “the” “sea” “is” “boiling”
“hot;” “and” “whether” “pigs” “have”
“wings”))
(list 139/34 17 14/17 6/7 3.5))
(define (text-signature words)
;;; XXX list of average word length, average sentence length,
;;; type-token ratio, hapax-legomena-ratio, average sentence complexity
(list 0 0 0 0 0))
; text-signature-difference : list-of-string list-of-string -> number
; Report how different words1 and words2 based on
; feature-weights.
(check-expect (text-signature-difference
(list “The” “time” “has” “come,” “the”
“Walrus” “said” “To” “talk” “of”
“many” “things:” “of” “shoes” “-”
“and” “ships” “-” “and” “sealing”
“wax,” “Of” “cabbages;” “and kings.”
“And” “why” “the” “sea” “is” “boiling”
“hot;” “and” “whether” “pigs” “have”
“wings”)
(list “The” “time” “has” “come,” “the”
“Walrus” “said” “To” “talk” “of”
“many” “things:” “of” “shoes” “-”
“and” “ships” “-” “and” “sealing”
“wax,” “Of” “cabbages;” “and kings.”
“And” “why” “the” “sea” “is” “boiling”
“hot;” “and” “whether” “pigs” “have”
“wings”)
(list 1 1 1 1 1))
1)
(check-expect (text-signature-difference
(list “my” “dOg” “has” “fleas” “he” “has”)
(list “The” “time” “has” “come,” “the”
“Walrus” “said” “To” “talk” “of”
“many” “things:” “of” “shoes” “-”
“and” “ships” “-” “and” “sealing”
“wax,” “Of” “cabbages;” “and kings.”
“And” “why” “the” “sea” “is” “boiling”
“hot;” “and” “whether” “pigs” “have”
“wings”)
(list 1 1 1 1 1))
32692/49623)
(define (text-signature-difference words1 words2 feature-weights)
(local
[(define (absolute-ratio x)
(cond
[(<= x 1) x]
[else (/ 1 x)]))]
(/ (apply +
(map absolute-ratio
(map /
(text-signature words1)
(text-signature words2))))
(apply + feature-weights))))
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;; book-length lists of strings to experiment with
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;; batch-io provides function read-words to read text from computer files
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;(require 2htdp/batch-io)
;(define mystery0 (read-words “mystery_files/mystery0.txt”))
;(define mystery1 (read-words “mystery_files/mystery1.txt”))
;(define mystery2 (read-words “mystery_files/mystery2.txt”))
;(define mystery3 (read-words “mystery_files/mystery3.txt”))
;(define mystery4 (read-words “mystery_files/mystery4.txt”))
;(define mystery5 (read-words “mystery_files/mystery5.txt”))