#!/usr/bin/env python3 # -*- coding: utf-8 -*- # Author: Stian Rødven Eide try: import xml.etree.cElementTree as ET except ImportError: import xml.etree.ElementTree as ET import re import os import sys import codecs import itertools as it import bz2 def combination_to_out(combination, savelex): '''This function takes a single combination and fills the savelex dictionary with its items, making sure each MWE only occurs once. If savelex is already filled, combination can be an empty list. Returns the lexes as a list.''' # The combination might be an empty list, in which case we assume # that savelex already have all lexes. We then simply return # the values of savelex as a list if not combination: lexes_out = [savelex[l] for l in savelex] return lexes_out # Combinations may be tuples, so we make sure they are not combination = list(combination) lexes_out = [] # We go through all the word positions in savelex and fill those # that are missing for i in sorted(savelex): if savelex[i] == None: # if combination[0] has a colon pointing to a position and # that position is filled by the lex referred to, remove # the savelex instead of setting it to combination[0]. This # is how we contract the MWE if ':' in combination[0]: # We set ref_post to the number after the colon. If the colon # signifies something else, we save and continue try: ref_post = re.match('(.+:)(\d+)', combination[0]).group(2) except AttributeError: savelex[i] = combination[0] combination.pop(0) continue # We see whether it matches whatever it refers to if (int(ref_post)-1) in savelex: if (re.sub(':\d+','',combination[0]) == re.sub(':\d+','',savelex[int(ref_post)-1])): # And if so, delete that item savelex.pop(i) combination.pop(0) continue # Otherwise, we save the current lex in the combination savelex[i] = combination[0] combination.pop(0) lexes_out.append(savelex[i]) return lexes_out def get_valid_combinations(combinations, savelex): '''A helper function to remove combinations of lexes that we know are referentially invalid. This can either be that a lex C refers to a lex B that refers to a lex A, or that a lex refers to another lex that doesn't exist.''' # We need to update this now, since we have joined the strings of # various options valid_combinations = [c for c in combinations] for c in combinations: removed = False mwes = [l for l in c if sep in l and not (l.startswith(sep) or l.startswith('http'))] # If there are no MWEs, the combination is probably valid if not mwes: continue # If a MWE-lex only matches itself, we discard the combination for l in mwes: matches = [l2 for l2 in mwes if re.sub('\.\..+','',l2) == re.sub('\.\..+','',l)] if len(matches) == 1: valid_combinations.remove(c) removed = True break if removed == True: continue # We make a copy of c and savelex so that we can manipulate them tmpc = [l for l in c] tmpsavelex = {k:savelex[k] for k in savelex} # We populate the temporary savelex, replacing all instances of # None with the lexes from the current combination for i in sorted(tmpsavelex): if tmpsavelex[i] == None: tmpsavelex[i] = tmpc[0] tmpc.pop(0) # We go through the fully populated temporary savelex, checking # that all references are valid for i in sorted(tmpsavelex): # If the current MWE lex has a referencing number if (sep in tmpsavelex[i] and ':' in tmpsavelex[i] and not tmpsavelex[i].startswith(sep)): # we get the lex base and its ref number clist = tmpsavelex[i].split(':') ref = clist[-1] lbase = ':'.join(clist[:-1]) lbase = lbase.split('..')[0] try: ref = int(ref) except ValueError: break try: reflex = tmpsavelex[ref-1] except KeyError: valid_combinations.remove(c) break refbase = reflex.split('..')[0] # If whatever the lex refers to has a colon we know # that it is invalid and remove the combination if ':' in reflex: valid_combinations.remove(c) break # If the base of the lex differs from the lex it # refers to, it is also invalid elif not lbase == refbase: valid_combinations.remove(c) break return valid_combinations def comp_lex(lex): '''Compresses a lex or saldo value, removes POS and conj/sense number. Returns the base, plus potential ref numbers. This is in order to compare two lex/saldo values to see if they are similar enough to be joined. ''' # If it ends with a colon, followed by digits if re.match('.+:\d+$',lex): # we set fbase to everything before the colon # and fref to the digits after the colon flist = lex.split(':') fbase = ':'.join(flist[:-1]) fref = flist[-1:][0] # We are reasonably sure that fbase now ends with . followed # by one or more digits. Actually, POS may be different too, and # that should also be OK. Which means we should set the base # to just before the double fullstops flem = lex.split('..')[0] fcomp = flem + fref # If it has no reference number else: fcomp = lex.split('..')[0] return fcomp def join_identicals(lexes): '''A helper function to join lexes we are unable to choose one of. Works only for saldo/lex modes In the case that there are items in the input list that have identical bases, i.e. the only difference is the SALDO POS or number, we remove all but the first one.''' # Trying from scratch newlexes = [] while lexes: # fl = first_of_list, rl = rest_of_list fl, rl = lexes[0], lexes[1:] fcomp = comp_lex(fl) rcomps = [comp_lex(l) for l in rl] if fcomp not in rcomps: newlexes.append(fl) lexes.pop(0) continue else: tojoin = [fl] for l in rl: rcomp = comp_lex(l) if fcomp == rcomp: tojoin.append(l) for l in tojoin: lexes.remove(l) newlexes.append('|'.join(tojoin)) return newlexes def remove_identicals(full_list): '''A helper function to remove lexes we know we don't need. It should work with a list of lexes for a single word or with a list of lists of lexes for a whole sentence. In the case that there are items in the input list that have identical bases, i.e. the only difference is the SALDO POS or number, we remove all but the first one.''' # We make reduced_list a copy of the input list reduced_list = [c for c in full_list] for i, c in enumerate(full_list): # If the items in the list are lists or tuples, we join # them to a string if type(c) == tuple or type(c) == list: c = ' '.join(c) # We remove everything except the base from the lex(es) # ex: slå_fast..vbm.1:11 -> slå_fast cstrip = re.sub('\.{2}.+?\d+:?\d*','',c) # Unless we are at the final item in the list, we compare the base # of the lex(es) to the rest of the list. If the base of an item in the # rest of the list is identical to the base of the current item, we # remove it. if len(full_list) > i: for c2 in full_list[i+1:]: if type(c2) == tuple or type(c) == list: c2 = ' '.join(c2) c2strip = re.sub('\.{2}.+?\d+:?\d*','',c2) if c2strip == cstrip: if c2 in reduced_list: reduced_list.remove(c2) return reduced_list def get_max_mwe_words(combinations): '''A helper function that finds the maximum number of MWE words possible among the combinations and removes all combinations that have fewer MWE words than that number. It does so by counting underscores, which should correspond to MWEs. It should, however, not be used before removing referentially invalid combinations''' max_combinations = [] max_underscores = -1 for c in combinations: underscores = 0 for l in c: if '..' in l and not (l.startswith(sep) or l.startswith('http')): lbase = l.split('..')[0] underscores += (len(lbase.split(sep))-1) if underscores > max_underscores: max_underscores = underscores for c in combinations: underscores = 0 for l in c: if '..' in l and not (l.startswith(sep) or l.startswith('http')): lbase = l.split('..')[0] underscores += (len(lbase.split(sep))-1) if underscores == max_underscores: max_combinations.append(c) return max_combinations def remove_leading_non_mwes(combinations, savelex): '''A helper function to move all lexes preceeding the first MWE from combinations to savelex. The number of lexes to be moved are determined by the combination with the leftmost MWE, thereby ensuring that all combinations keep the same length.''' # In case we already filled savelex and combinations is empty, we # simply return the input if not combinations: return combinations, savelex mincounter = None # We check how many lexes there are before the first MWE word in the # combination with the least lexes before the first MWE word for c in combinations: counter = 0 for l in c: if sep in l and not l.startswith(sep): break else: counter += 1 if not mincounter or counter < mincounter: mincounter = counter # If there are any lexes before the first MWE word, we proceed to # save them and remove them from all combinations if mincounter > 0: reduced_combinations = [] remwords = [] # We use the lexes from the first combination for i in range(mincounter): remwords.append(combinations[0][i]) # Then we remove them from all combinations for c in combinations: reduced_combinations.append(c[mincounter:]) combinations = [c for c in reduced_combinations if c] # Lastly, we add them to savelex savecount = 0 for l in sorted(savelex): if savecount == mincounter: break if savelex[l] == None: savelex[l] = remwords[savecount] savecount += 1 return combinations, savelex def get_best_leftmost(combinations): '''A helper function that removes all combinations where the first MWE is shorter than in other combinations. ''' # TODO: We should also make sure that we only count MWE lexes that # refer to the first MWE lex. As it is now, we would give preference # also to combinations where the first MWE is used multiple times reduced_combinations = [] maxlen = -1 # First we determine the length of the first MWE in the combination # with the longest first MWE for c in combinations: counter = len([l for l in c if re.sub(':\d+','',l) == re.sub(':\d+','',c[0])]) if counter > maxlen: maxlen = counter # Then we remove all combinations where the first MWE is shorter # than the longest first MWE for c in combinations: counter = len([l for l in c if re.sub(':\d+','',l) == re.sub(':\d+','',c[0])]) if counter == maxlen: reduced_combinations.append(c) return reduced_combinations def get_most_compact(combinations): '''A helper function that finds, among all combinations, the smallest distance between the first lex, which should be part of a MWE, and the last lex that it matches. It is currently unused, as it might discard combinations where the first MWE is used multiple times in the same sentence.''' # TODO: We should also make sure that we only count MWE lexes that # refer to the first MWE lex. As it is now, we would give preference # also to combinations where the first MWE is used multiple times reduced_combinations = [] pos_of_last = 0 mindist = None # First we determine the smallest possible distance, among all # combinations, between the first lex and the last lex that it # matches for c in combinations: mwedist = 0 for i, l in enumerate(c): if re.sub(':\d+','',l) == re.sub(':\d+','',c[0]): mwedist = i if mindist == None or mwedist < mindist: mindist = mwedist # Then we remove all combinations where the distance is greater # than that. for c in combinations: mwedist = 0 for i, l in enumerate(c): if re.sub(':\d+','',l) == re.sub(':\d+','',c[0]): mwedist = i if mwedist == mindist: reduced_combinations.append(c) return reduced_combinations def save_first_mwe(combinations, savelex): '''A helper function that moves the first MWE from the first combination to savelex, and then removes the first MWE from all combinations. The function assumes that the first MWE is the same for all combinations.''' # TODO: There might be cases where the first MWE is not the same for # all combinations. # In case we already filled savelex and combinations is empty, we # simply return the input if not combinations: return combinations, savelex reduced_combinations = [] # The first lex should be part of a MWE first_mwe_lex = combinations[0][0] # This list will hold the word positions of all lexes that are part # of the first MWE mwe_positions = [0] # This list will hold the lexes that are part of the first MWE to_be_removed = [first_mwe_lex] ref = 0 # We set the ref variable to the position of the first lex of the # first MWE for i in sorted(savelex): if savelex[i] == None: ref = i+1 break # We go through the first of the combinations. For each lex that has # an identical base to the first one and also a reference number # pointing to the first one, we mark them to be removed, keeping # only the first. for i, l in enumerate(combinations[0][1:]): if re.sub(':\d+','',l) == re.sub(':\d+','',first_mwe_lex) and ':' in l: # The program breaks if a "word" contains a colon. # As a workaround, we just catch the ValueError and # set that word to be removed. try: if int(l.split(':')[-1]) == ref: mwe_positions.append(i+1) to_be_removed.append(l) except ValueError: print("Bad word: %s" % l) to_be_removed.append(l) # We go through all combinations, removing all lexes that are marked # for removal for c in combinations: redc = [] remove = [r for r in to_be_removed] for l in c: if remove: if l == remove[0]: remove.pop(0) else: redc.append(l) else: redc.append(l) reduced_combinations.append(redc) # We then add the removed lexes to savelex counter = 0 for i in sorted(savelex): # The first None in the savelex should correspond to the first # lex of the first MWE if not mwe_positions: break if counter == 0 and savelex[i] == None: savelex[i] = first_mwe_lex mwe_positions.pop(0) counter += 1 # We remove all other lexes that are part of the first MWE from # savelex elif counter == mwe_positions[0] and savelex[i] == None: savelex.pop(i) mwe_positions.pop(0) counter += 1 # Other MWEs are left alone elif savelex[i] == None: counter += 1 return reduced_combinations, savelex def check_mwe(sentence, mode): '''A function to contract Multi-Word Expressions (MWEs) in an XML sentence from a Korp corpus. Returns a list of the contents of each word's LEX tag, with two modifications: 1) If there is no LEX tag, it will be substituted by the word itself, and 2) Each MWE will only be written at the position of the first part of that MWE. The main heuristics are to 1) choose the longest MWEs and 2) choose the leftmost MWE. While this should take care of nearly all cases, we might update the code with a third heuristic to choose the most compact MWE.''' global sep # We set the separator to space if the mode is lemma, otherwise # to underscore if mode == 'lemma': sep = ' ' else: sep = '_' # We make a list of lists of lexes: lexlists = [] for w in sentence: # Sometimes there are other elements than in a sentence # element. Whenever that is the case, we return None if w.tag != 'w': return None # For some reason, the parser have troubles with words # sometimes, setting them to None, which causes errors if w.text == None: w.text = 'noword' # The pipe symbol | is used as a separator in the annotation # lexes is a list of lexes for a given word lexes = [l for l in w.attrib[mode].split('|') if l] # We check how many MWE lexes there are for the current word mwes = [l for l in lexes if sep in l and not l.startswith(sep) and not l.startswith('http')] # If we find any, we remove all duplicates before adding the # list of lexes to the lexlist if len(mwes) > 0: if mode != 'lemma': lexes = join_identicals(lexes) lexlists.append(lexes) # If there are no lexes for the current word, we append the # word instead elif len(lexes) == 0: lexlists.append([w.text]) # If there are lexes for the current word, but no MWE, we # choose the first lex else: lexlists.append(['|'.join(lexes)]) # If there are no MWE's in the sentence, we return a list of the # first lex of each word. We assume that there is an MWE if any lex # has an underscore which is not at the beginning of the string if not [w.attrib[mode] for w in sentence if sep in w.attrib[mode] and not w.attrib[mode].startswith(sep)]: # We flatten the list before returning it lexes_out = [lex for wlexes in lexlists for lex in wlexes] return lexes_out # If we are certain of a single lex, we save it along with its # word's position in the sentence. The position will be needed # later to determine valid references among lexes. Positions # are here counted from zero. savelex = {} # and make a list of those we are unsure of uslist = [] for i, lexes in enumerate(lexlists): mwes = [l for l in lexes if sep in l and not l.startswith(sep) and not l.startswith('http')] # If there is only one lex to choose and it's not part of # a MWE, we choose that # was: if len(lexes) == 1 and '_' not in lexes[0]: if len(lexes) == 1 and (sep not in lexes[0] or lexes[0].startswith(sep) or lexes[0].startswith('http')): savelex[i] = lexes[0] # In case we have an empty word tag and the lexes is empty elif not lexes: savelex[i] = '' # If no MWEs are found we choose the first elif not mwes: savelex[i] = '|'.join(lexes) # Sometimes we have one lex tagged as an MWE and only # that, although it doesn't match other words' lexes. This # probably means that a contracted form of a MWE has been used. # If we find an underscore we store it as None for now, unless # it's of a MWE that is alone, in which case we save it properly else: # We make a flat list of all lexes for all words in the # sentence except for the word we are working on all_except_current = lexlists[:i] + lexlists[i+1:] all_flat = [re.sub(':\d+','',i) for l in all_except_current for i in l] # We check whether the MWE lexes for the current word matches # the lexes for any other words in the sentence identicals = False for l in mwes: if re.sub(':\d+','',l) in all_flat: identicals = True break # If we don't find a match, we save it # Since the first item might be a wrongly added MWE-word, we # choose the first without underscores if that exists if identicals == False: non_mwes = [l for l in lexes if l not in mwes] if non_mwes: savelex[i] = non_mwes[0] else: savelex[i] = lexes[0] # If we find a match, we have a proper MWE that neeeds to be # contracted. We save it as None and add it to the list of # unsure lexes else: savelex[i] = None uslist.append(lexes) # In case our uncertain list is empty, we return our saved items if not uslist: return [savelex[i] for i in sorted(savelex)] # In case we have a big uslist, the number of combinations might # become enormous. We should preferably pre-process the uslist # whenever that happens, but for now, we'll simply set 33 as a # cutoff. If the flat uslist is bigger than that, we choose the # first possible combination. We write the uslist to a log file # to make it easier to handle it properly later. # Instead of flattening the list, we calculate the product # beforehand: comblength = 1 uslengths = [len(l) for l in uslist] for i in uslengths: comblength *= i if comblength > 500000: print("Too many combinations!: %s" % comblength) print("Skipping") with open('flatuslog.txt', 'a+') as logfile: logfile.write("------------------") for l in uslist: lstring = ' '.join(l) logfile.write(lstring + '\n') return None # We generate all combinations of the unsure ones combinations = it.product(*uslist) # We convert the generator into a list combinations = [c for c in combinations] # Then we remove all combinations that are referentially invalid valid_combinations = get_valid_combinations(combinations, savelex) # In some cases, Korp's annotation doesn't provide any valid # combinations. In this case, we choose either the first # combination with an MWE for the first lex, if it exists, or, if # it doesn't, the first combination there is if not valid_combinations: for c in combinations: if sep in c[0] and not c[0].startswith(sep): return combination_to_out(c, savelex) return combination_to_out(combinations[0], savelex) # In case the list of combinations is really large, we delete it del combinations # We remove all combinations that are identical to another # combination except for the SALDO numbers being different, # keeping only the first valid_combinations = remove_identicals(valid_combinations) # At some point I get empty combinations. I need to find out what causes # that. For the time being, we'll just remove them # The following four lines should be removed after one more round of testing valid_combinations = [c for c in valid_combinations if c] unsures = [i for i in sorted(savelex) if savelex[i] == None] if not unsures: return [savelex[i] for i in sorted(savelex)] # Do we have only one valid combination? Choose that! if len(valid_combinations) == 1: return combination_to_out(valid_combinations[0], savelex) # Do we have the option to use all words in one MWE? Choose that! for c in valid_combinations: # We check that the length of the MWE is equal to the number # of lexes in the combination if not c[0].startswith(sep) and len(c[0].split(sep)) == len(c): # We check whether all words in the combination are part of # the same MWE sameas = [l for l in c if re.sub(':\d+','',l) == re.sub(':\d+','',c[0])] if len(sameas) == len(c): return combination_to_out(c, savelex) # We reduce the list of combinations to the one(s) with the most # words used in MWEs max_combinations = get_max_mwe_words(valid_combinations) # We also check whether we are finished and return if we are if len(max_combinations) == 1: return combination_to_out(max_combinations[0], savelex) # Here we start the following process # 1) Save and remove the leading lexes from combinations if # they are not part of a MWE # 2) Remove combinations that are not among those with # the leftmost of the longest MWE # 3) Save the first MWE # Repeat if we have managed to reduce the number of combinations last_length = len(max_combinations) + 1 while len(max_combinations) < last_length: if len(max_combinations) == 1: return combination_to_out(max_combinations[0], savelex) elif len(max_combinations) == 0: return combination_to_out([], savelex) last_length = len(max_combinations) # If all combinations start with lexes that are not part of a # MWE, we move them to savelex. max_combinations, savelex = remove_leading_non_mwes(max_combinations, savelex) # Now we know that the first lex in max_combinations is part of # a MWE. We remove all combinations where the first MWE is # shorter than the longest possible. max_combinations = get_best_leftmost(max_combinations) # Assuming that the first MWE is the same for all combinations, # we move that MWE to savelex max_combinations, savelex = save_first_mwe(max_combinations, savelex) # When we no longer are able to reduce the number of combinations, # we repeat steps 1 and 3 as long as it reduces the first # combination last_length = len(max_combinations[0]) + 1 while max_combinations and len(max_combinations[0]) < last_length: if not max_combinations[0]: return combination_to_out([], savelex) last_length = len(max_combinations[0]) max_combinations, savelex = remove_leading_non_mwes(max_combinations, savelex) max_combinations, savelex = save_first_mwe(max_combinations, savelex) # If at this point we haven't yet found a winning combination, we # choose the first available. It might also be that we have emptied # the list of combinations, in which case we use an empty list. if len(max_combinations) == 0: return combination_to_out([], savelex) elif len(max_combinations) >= 1: return combination_to_out(max_combinations[0], savelex) if __name__ == '__main__': print("You probably want to use bw_extract.py instead of this one")