'''
Created on Jan 17, 2017

@author: David
'''
from tree_representation import TreeRepresentation,TreeNode
from xml.etree import ElementTree as ET


def _new_tree(node_id, content):
    """Create a TreeRepresentation whose root node has the given id and content."""
    tree_root = TreeNode()
    tree = TreeRepresentation()
    tree_root.setId(node_id)
    tree_root.setContent(content)
    tree.setRoot(tree_root)
    return tree


def build_trees(response):
    """
    Takes an XML string from Sparv and returns three TreeRepresentations:
    word form tree, part-of-speech tree and dependency relations tree.

    The second child of the parsed document element is used as the corpus.
    Inside it, the single <w> element with deprel="ROOT" becomes the root of
    all three trees; every <w> element carrying a "dephead" attribute is
    attached below the node whose id equals that "dephead" value. The three
    trees share the same shape and node ids ("ref") and differ only in node
    content: element text, "pos" attribute and "deprel" attribute.

    Each returned tree gets a "height" attribute. It is the number of times
    the dephead value increases while walking the sorted node list.
    NOTE(review): this is not necessarily the true depth of the tree; it is
    kept as-is because callers may rely on the current value - confirm.

    Raises:
        AttributeError: if the corpus has no ROOT word or more than one.
        Exception: ("No reference node found") if a word's dephead cannot be
            resolved to a node of the tree (unknown id, or a cycle that is
            not connected to the root).
    """
    # Parse result into Element Tree
    result = ET.fromstring(response)
    # Get corpus element from tree
    corpus = result[1]

    # Get root; exactly one is required
    roots = corpus.findall(".//w[@deprel='ROOT']")
    if len(roots) == 0:
        raise AttributeError("No root found!")
    if len(roots) > 1:
        raise AttributeError("Multiple roots found!")
    root = roots[0]

    # Common attributes
    root_id = root.attrib["ref"]
    root_content = root.text
    root_pos = root.attrib["pos"]
    root_dep = root.attrib["deprel"]
    root_index = int(root_id)

    def depheadsort(node):
        """
        Sort nodes by dephead if dephead >= root id, sorts all other nodes
        by dephead behind
        """
        i_dep = int(node.attrib["dephead"])
        # Tuple key: False sorts before True, so heads at or after the root
        # come first. Unlike a fixed numeric offset this also holds for
        # sentences with 100 or more tokens.
        return (i_dep < root_index, i_dep)

    word_tree = _new_tree(root_id, root_content)
    pos_tree = _new_tree(root_id, root_pos)
    dep_tree = _new_tree(root_id, root_dep)

    # sort nodes in ascending order by dependency head with nodes where
    # dephead >= root id first, then all other nodes
    nodes = sorted(corpus.findall(".//w[@dephead]"), key=depheadsort)

    # Height: count how often the head value increases in sorted order.
    height = 0
    prev_head = -1
    for node in nodes:
        c_head = int(node.attrib["dephead"])
        if c_head > prev_head:
            height += 1
            prev_head = c_head

    # Attach every node below its head. The sort order does not guarantee
    # that a head is inserted before its dependents, so nodes whose head is
    # not in the tree yet are deferred and retried on the next pass. When
    # the sorted order is already valid, everything is attached in the first
    # pass in exactly that order.
    pending = nodes
    while pending:
        deferred = []
        for node in pending:
            head = node.attrib["dephead"]
            content = node.text
            pos = node.attrib["pos"]
            dep = node.attrib["deprel"]
            nid = node.attrib["ref"]

            # getNodeById is used as returning a sequence of matches
            w_matches = word_tree.getNodeById(head)
            if len(w_matches) == 0:
                deferred.append(node)
                continue
            w_ref_node = w_matches[0]
            p_ref_node = pos_tree.getNodeById(head)[0]
            d_ref_node = dep_tree.getNodeById(head)[0]

            w_child = TreeNode(content, nid, [])
            p_child = TreeNode(pos, nid, [])
            d_child = TreeNode(dep, nid, [])

            w_ref_node.addChild(w_child)
            word_tree.addNode(w_child)
            p_ref_node.addChild(p_child)
            pos_tree.addNode(p_child)
            d_ref_node.addChild(d_child)
            dep_tree.addNode(d_child)

        if len(deferred) == len(pending):
            # A full pass attached nothing: the remaining heads can never
            # be resolved.
            raise Exception("No reference node found")
        pending = deferred

    word_tree.height = height
    pos_tree.height = height
    dep_tree.height = height
    return (word_tree, pos_tree, dep_tree)