"""Exercise solutions q1-q6: even sums, longest word, TTR, LIX, spam filter, cosine similarity."""

from collections import Counter
from math import sqrt


# q1
def is_even(nbr):
    """Return True if ``nbr`` is divisible by two."""
    return nbr % 2 == 0


def sum_even(lst):
    """Return the sum of the even numbers in ``lst`` (0 if there are none)."""
    return sum(n for n in lst if is_even(n))


def Q1():
    """Print the sum of the even numbers in a sample list."""
    print(sum_even([7, 8, 3, 7, 6, 1]))


# q2
def find_longest_word(filename):
    """Return the longest whitespace-separated word in the file ``filename``.

    The content is split on any whitespace rather than on newlines only, so
    trailing spaces or carriage returns never end up in the result and a line
    holding several words is not mistaken for a single word. If several words
    share the maximum length the first one is returned. A file without any
    words yields the empty string.
    """
    with open(filename) as f:
        words = f.read().split()
    return max(words, key=len, default="")


def Q2():
    """Print the longest word found in 'q2_testfile.txt'."""
    print(find_longest_word('q2_testfile.txt'))


# q3
def ttr(words):
    """Return the type/token ratio of ``words``.

    This is the number of distinct words divided by the total number of
    words. An empty sequence has no defined ratio; 0.0 is returned instead of
    raising ZeroDivisionError.
    """
    n_tokens = len(words)
    if n_tokens == 0:
        return 0.0
    return len(set(words)) / n_tokens


def Q3():
    """Print the type/token ratio of a sample word list."""
    print(ttr(['rose', 'is', 'a', 'rose', 'is', 'a', 'rose', 'is', 'a', 'rose']))


# q4
def lix_score(text):
    """Return the LIX readability score of ``text``.

    Args:
        text: A sequence of sentences, each sentence being a sequence of
            words.

    Returns:
        ``Nw / Ns + 100 * Nlw / Nw`` where ``Ns`` is the number of sentences,
        ``Nw`` the number of words and ``Nlw`` the number of words longer
        than six characters. Returns 0.0 when there are no sentences or no
        words, because the formula would otherwise divide by zero.
    """
    n_sentences = len(text)
    n_words = sum(len(sent) for sent in text)
    if n_sentences == 0 or n_words == 0:
        return 0.0
    n_long_words = sum(1 for sent in text for word in sent if len(word) > 6)
    return n_words / n_sentences + 100 * n_long_words / n_words


def Q4():
    """Print the LIX score of a two-sentence sample document."""
    doc = [
        ['The', 'hedgehog', 'lives', 'in', 'the', 'barn'],
        ['His', 'name', 'is', 'Oscar'],
    ]
    print(lix_score(doc))


# q5
class SpamFilter:
    """Toy spam filter: an email counts as spam unless it contains "Hi"."""

    def guess(self, email):
        """Return True if ``email`` is classified as spam.

        The classification is a case-sensitive substring test for "Hi".
        """
        return "Hi" not in email

    def evaluate(self, ts):
        """Score the filter against a labelled test set.

        Args:
            ts: A sized collection of ``(is_spam, email)`` pairs.

        Returns:
            A tuple ``(accuracy, false_positive_rate)``. Accuracy is the
            share of correctly classified emails. The false positive rate is
            the share of non-spam emails that were classified as spam. Each
            rate is 0.0 when its denominator is zero (empty test set, or no
            non-spam emails).
        """
        n = len(ts)
        n_true_pos = 0
        n_true_neg = 0
        n_false_pos = 0
        for truth_value, email in ts:
            # Classify once per email; the result feeds all three counters.
            predicted = self.guess(email)
            if truth_value:
                if predicted:
                    n_true_pos += 1
            elif predicted:
                n_false_pos += 1
            else:
                n_true_neg += 1
        acc = (n_true_pos + n_true_neg) / n if n else 0.0
        n_negatives = n_false_pos + n_true_neg
        fpr = n_false_pos / n_negatives if n_negatives else 0.0
        return acc, fpr


def Q5():
    """Print accuracy and false positive rate on a four-email sample set."""
    test_set = [
        (True, "Buy more prescription DRUGS CHEAP!!!! !"),
        # The line break inside this message is written as an escape; a raw
        # line break inside a single-quoted literal is a syntax error.
        (False, "Hi, how are you? \nHow did the exam go?"),
        (False, "Dear student, Here is the result of your exam in Formal Linguistics."),
        (True, "Dear Madam, would you like to win $1000000000000?"),
    ]
    sf = SpamFilter()
    print(sf.evaluate(test_set))


#Q5()


# q6
def frequencies(words):
    """Return a plain dict mapping each word in ``words`` to its count."""
    return dict(Counter(words))


def frequencies2(words):
    """Return a ``collections.Counter`` with the count of each word."""
    return Counter(words)


def dot(d1, d2):
    """Return the dot product of the word-frequency vectors of two documents.

    Only words present in both ``d1`` and ``d2`` contribute to the sum.
    """
    f1 = frequencies(d1)
    f2 = frequencies(d2)
    return sum(count * f2[word] for word, count in f1.items() if word in f2)


def cos_sim(d1, d2):
    """Return the cosine similarity of the word-frequency vectors of two documents.

    Returns 0.0 when either document is empty, since its vector has zero
    length and the quotient would otherwise divide by zero.
    """
    norm = sqrt(dot(d1, d1)) * sqrt(dot(d2, d2))
    if norm == 0:
        return 0.0
    return dot(d1, d2) / norm


def most_similar(d, n, docs):
    """Return the ``n`` documents in ``docs`` most similar to ``d``.

    Args:
        d: The query document (a sequence of words).
        n: How many results to keep.
        docs: The candidate documents.

    Returns:
        A list of ``(similarity, document)`` tuples in descending order of
        cosine similarity. Sorting uses the score only, so equally similar
        documents keep their order from ``docs`` and the documents
        themselves are never compared with each other.
    """
    scored = [(cos_sim(d, doc), doc) for doc in docs]
    scored.sort(key=lambda pair: pair[0], reverse=True)
    return scored[:n]


def Q6():
    """Print the two sample documents most similar to the first one."""
    d1 = ['apples', 'apples', 'oranges', 'apples', 'apples', 'apples']
    d2 = ['apples', 'oranges', 'apples', 'oranges', 'oranges']
    d3 = ['oranges', 'apples', 'oranges', 'oranges', 'oranges']
    docs = [d1, d2, d3]
    print(most_similar(d1, 2, docs))


# Run the demo only when executed as a script, not on import.
if __name__ == "__main__":
    Q6()