import math
import nltk
from collections import Counter

# functions to read the corpus

def read_tagged_sentence(f):
    line = f.readline()
    if not line:
        return None
    sentence = []
    while line and (line != '\n'):
        line = line.strip()
        word, tag = line.split('\t', 1)
        sentence.append((word, tag))
        line = f.readline()
    return sentence

def read_tagged_corpus(filename):
    sentences = []
    with open(filename, encoding='utf-8') as f:
        sentence = read_tagged_sentence(f)
        while sentence:
            sentences.append(sentence)
            sentence = read_tagged_sentence(f)
    return sentences

# baseline using NLTK

def find_most_common_tag(tagged_sentences):
    # make a frequency table of the tags occurring in the sentences
    tag_counter = Counter(t for s in tagged_sentences for _, t in s)
    # find the tag with the highest frequency, typically a noun tag
    return tag_counter.most_common(1)[0][0]

def train_nltk_baseline(tagged_sentences):
    # find the most common tag
    most_common_tag = find_most_common_tag(tagged_sentences)
    # make a backoff tagger that always outputs that tag
    backoff_tagger = nltk.DefaultTagger(most_common_tag)
    # finally, build a unigram tagger from the corpus
    return nltk.UnigramTagger(tagged_sentences, backoff=backoff_tagger)

# skeleton for the bigram tagger code

def hmm_train_tagger(tagged_sentences):
    # estimate the emission and transition probabilities
    # return the two probability tables
    pass

def hmm_tag_sentence(tagger_data, sentence):
    # apply the Viterbi algorithm
    # then retrace your steps
    # finally return the list of tagged words
    pass

# sentence-boundary sentinel tags; they must be distinct from each other
# and from the real tags in the corpus
START = "<s>"
END = "</s>"

def viterbi(tagger_data, sentence):
    # make a dummy link with log prob of 0, START tag, and no predecessor
    # current list = [ the dummy link ]
    # for each word in the sentence:
    #     previous list = current list
    #     current list = []
    #     determine the possible tags for this word
    #
    #     for each tag of the possible tags:
    #         determine the emission log probability of the word for this tag
    #         find the predecessor that gives the highest total log probability,
    #         where the total log probability is the sum of
    #             1) the emission log probability,
    #             2) the log probability of the transition from the tag of the
    #                predecessor to the current tag,
    #             3) the total log probability of the predecessor
    #         make a new link with this tag and add it to the current list
    # end the sequence with a dummy: the highest-scoring link with the tag END
    pass

def retrace(end_link, sentence_length):
    # tags = []
    # link = predecessor of end_link
    # while the tag of the link isn't START:
    #     add the tag of the link to tags
    #     link = predecessor of link
    # reverse the list of tags and return it
    pass

all_sentences = read_tagged_corpus(YOUR_CORPUS)

# divide the sentences into a training and a test part

# train the bigram tagger

# train the baseline tagger

# evaluate the bigram tagger and the baseline
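
# ---------------------------------------------------------------------------
# One possible way to fill in the skeleton above, given only as a sketch.
# It assumes the two-column word/tag format read by read_tagged_corpus,
# add-one smoothing for the emission probabilities, and a lattice "link"
# represented as a (total log probability, tag, predecessor link) tuple.
# The *_sketch names are not part of the original skeleton.
# ---------------------------------------------------------------------------

def hmm_train_tagger_sketch(tagged_sentences):
    # count tag occurrences, tag bigrams (including START/END), and emissions
    tag_counts = Counter()
    transition_counts = Counter()
    emission_counts = Counter()
    vocabulary = set()
    for sentence in tagged_sentences:
        previous_tag = START
        for word, tag in sentence:
            tag_counts[tag] += 1
            transition_counts[(previous_tag, tag)] += 1
            emission_counts[(tag, word)] += 1
            vocabulary.add(word)
            previous_tag = tag
        transition_counts[(previous_tag, END)] += 1
    tag_counts[START] = len(tagged_sentences)

    # relative-frequency transition log probabilities
    transitions = {
        (prev, tag): math.log(count / tag_counts[prev])
        for (prev, tag), count in transition_counts.items()
    }
    # add-one smoothed emission log probabilities, plus a per-tag fallback
    # for words never seen with that tag
    vocab_size = len(vocabulary) + 1
    emissions = {
        (tag, word): math.log((count + 1) / (tag_counts[tag] + vocab_size))
        for (tag, word), count in emission_counts.items()
    }
    unknown = {
        tag: math.log(1 / (tag_counts[tag] + vocab_size))
        for tag in tag_counts if tag != START
    }
    return transitions, emissions, unknown

def viterbi_sketch(tagger_data, sentence):
    transitions, emissions, unknown = tagger_data
    # start with a dummy link: log prob 0, START tag, no predecessor
    current = [(0.0, START, None)]
    for word in sentence:
        previous = current
        current = []
        # candidate tags: tags the word was seen with, else all known tags
        seen = [tag for tag in unknown if (tag, word) in emissions]
        possible_tags = seen if seen else list(unknown)
        for tag in possible_tags:
            emission_logprob = emissions.get((tag, word), unknown[tag])
            # pick the predecessor maximising predecessor total + transition
            best = max(
                previous,
                key=lambda link: link[0]
                + transitions.get((link[1], tag), float('-inf')),
            )
            total = (best[0]
                     + transitions.get((best[1], tag), float('-inf'))
                     + emission_logprob)
            current.append((total, tag, best))
    # close the sequence with the highest-scoring END link
    best = max(
        current,
        key=lambda link: link[0] + transitions.get((link[1], END), float('-inf')),
    )
    end_total = best[0] + transitions.get((best[1], END), float('-inf'))
    return (end_total, END, best)

def retrace_sketch(end_link, sentence_length):
    # follow the predecessor pointers back from END to START, then reverse
    tags = []
    link = end_link[2]
    while link is not None and link[1] != START:
        tags.append(link[1])
        link = link[2]
    tags.reverse()
    assert len(tags) == sentence_length
    return tags

def hmm_tag_sentence_sketch(tagger_data, sentence):
    # sentence is a list of words; returns a list of (word, tag) pairs
    end_link = viterbi_sketch(tagger_data, sentence)
    tags = retrace_sketch(end_link, len(sentence))
    return list(zip(sentence, tags))

# Possible usage, assuming training_sentences / test_sentences come from a
# split of all_sentences:
#   tagger_data = hmm_train_tagger_sketch(training_sentences)
#   words = [w for w, _ in test_sentences[0]]
#   print(hmm_tag_sentence_sketch(tagger_data, words))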