#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Author: Stian Rødven Eide

try:
    import xml.etree.cElementTree as ET
except ImportError:
    import xml.etree.ElementTree as ET
import re
import os
import bz2
import argparse
import contract


def extract_bw(sentence):
    '''This function takes an XML sentence as input and returns either a list
    of the words themselves (if plain mode is used) or a list of either the
    lemma, saldo or lex attributes, depending on mode. If any of the latter,
    only the first attribute is used for any given word, and if missing, it
    is substituted by the word.'''
    if mode == 'lemma':
        sep = ' '
    else:
        sep = '_'
    sentlist = []
    for w in sentence:
        if mode == 'plain':
            if w.text:
                sentlist.append(w.text)
            else:
                sentlist.append('noword')
        else:
            lexes = [l for l in w.attrib[mode].split('|')
                     if (l and not sep in l)]
            if lexes:
                sentlist.append('|'.join(lexes))
            elif w.text:
                sentlist.append(w.text)
            else:
                sentlist.append('noword')
    return sentlist


def process_dir(directory):
    '''This function traverses a directory and processes any bzipped
    xml files it finds. If the MWE option is used, it calls the check_mwe
    function in contract.py for each sentence element. If the MWE option is
    not used, the extract_bw function above is called instead.

    For each sentence, a string is written to outfile, formatted depending on
    whether plain, lemma, saldo or lex mode is used.'''
    global mode
    global mwe
    global genre
    global outfile
    global first
    for root, dirs, files in os.walk(directory):
        os.chdir(root)
        folder_content = os.listdir(root)
        xmldocs = [f for f in folder_content if f.endswith('.xml.bz2')]
        if xmldocs:
            for doc in xmldocs:
                noparse = False
                print("Processing {dir}/{file}".format(dir=root,file=doc))
                infile = bz2.BZ2File(doc, 'rb')
                xmldata = ET.iterparse(infile, events=['start', 'end'])
                _, xroot = next(xmldata)
                for event, element in xmldata:
                    if genre != 'all':
                        if element.tag == 'text' and event == 'end':
                            noparse = False
                        if element.tag == 'text' and event == 'start':
                            if 'genre' in element.attrib:
                                if element.attrib['genre'] != genre:
                                    noparse = True
                                    continue
                    if noparse == True:
                        continue
                    if element.tag == 'sentence' and event == 'end':
                        if mwe:
                            sent = contract.check_mwe(element, mode)
                        else:
                            sent = extract_bw(element)
                        if sent == None:
                            continue
                        if first:
                            sent = [s.split('|')[0] for s in sent if s]
                        else:
                            sent = [s for s in sent if s]
                        outstring = ' '.join(sent)
                        with open(outfile, "a+") as f:
                            f.write(outstring + '\n')
                    xroot.clear()


if __name__ == '__main__':
    directory = os.getcwd()
    parser = argparse.ArgumentParser()
    parser.add_argument('--mode', choices=['plain', 'lemma', 'saldo', 'lex'],
                        default="plain")
    parser.add_argument('--mwe', action="store_true", default=False)
    parser.add_argument('--first-only', action="store_true", default=False)
    parser.add_argument('--genre', choices=['fiction', 'government', 'news',
                        'science', 'socialmedia', 'all'], default="all")
    parser.add_argument('outfile')
    args = parser.parse_args()
    mode = args.mode
    mwe = args.mwe
    genre = args.genre
    first = args.first_only
    outfile = directory + '/' + args.outfile
    if mode == 'plain' and mwe == True:
        print('Multi-Word Expressions are not available for plain mode\n')
        parser.print_help()
    elif mode == 'plain' and first == True:
        print('First-only is not available for plain mode\n')
        parser.print_help()
    else:
        process_dir(directory)