#!/usr/bin/env perl ########################################################################## # Extracts paradigm from newline-separated list of word forms # # using a strategy of extracting the longest common subsequence # # shared by all the forms in the paradigm. # # # # Needs (1) foma installed, (2) extract.foma in the current directory # # # # Options: -p print full newline-separated forms instead of compact form # # -a print all possible paradigms that fit the MLCS pattern # # -u collapse similar paradigms # # -d print debug info to stderr # # # # Example input: # # ring # # rang # # rung # # rings # # ringing # # # # Example output: # # 1+"i"+2#1+"a"+2#1+"u"+2#1+"i"+2+"s"#1+"i"+2+"ing" 0=ring,1=r,2=ng # # # # with the -p flag (useful for debugging) the output is: # # r+"i"+ng # # r+"a"+ng # # r+"u"+ng # # r+"i"+ng+"s" # # # # Several patterns may be sent (newline-separated), which produces an # # output line for each pattern. # # The -u flag collapses identical paradigms; however, the variable parts # # are retained. For example, if the input is: # # # # trång # # trängre # # # # lång # # längre # # # # The output with the -u flag will be: # # 1+"å"+2#1+"ä"+2+"re" 0=trång,1=tr,2=ng#0=lång,1=l,2=ng # # # # MH20130923 # ########################################################################## #use open qw/:std :utf8/; use FileHandle; use IPC::Open2; use Getopt::Std; use File::Basename; use Cwd 'abs_path'; use Encode; my $dirname = dirname(abs_path($0)); getopts("apud", \%opts); if (defined($opts{a})) { $PRINT_ALL = 1; } else { $PRINT_ALL = 0; } if (defined($opts{p})) { $PRINT_PARADIGM = 1; } else { $PRINT_PARADIGM = 0; } if (defined($opts{u})) { $COLLAPSE_PARADIGMS = 1; } else { $COLLAPSE_PARADIGMS = 0; } if (defined($opts{d})) { $DEBUG = 1; } else { $DEBUG = 0; } while () { chomp; if ($_ eq "") { if (@wordforms) { push @inputpatterns, join('#', @wordforms); # create array of patterns, each entry in pattern #-separated } undef @wordforms; next; } ($wordform, $dummy) = split /\t/ ; if ($wordform =~ /,?/) { @wfsalt = split /(?<=\,)/m, $wordform ; push @wordforms, @wfsalt; } else { push @wordforms, $wordform; } } if (@wordforms) { push @inputpatterns, join('#', @wordforms); # array of patterns } foreach (@inputpatterns) { @wordforms = split '#'; dprint("Analyzing $wordforms[0]\n"); $lcp = longest_common_prefix(@wordforms); # We cut off the LCP to speed up foma's extraction of the sequences # but add it back after the pattern is extracted my @subs = (); foreach $i (0 .. $#wordforms) { if ($lcp ne "") { $wordforms[$i] =~ s/^$lcp/|/; # Replace LCP with | } push @subs, "Subsequence({" .$wordforms[$i] ."})"; } $pidMORPH = open2($Reader, $Writer, "/usr/bin/env foma -p -l $dirname/extract.foma"); print $Writer "define MLCS Longest( " .join('&', @subs) ." ) ;\n" ."define WordSeq {" .join('#', @wordforms) ."};\n" ."define RepeatedPatterns AddExtra([MarkRoot(MLCS) [%# MarkRoot(MLCS)]*]);\n" ."define RepeatedPatternsEQ AddExtra(RedupN(MarkRoot(MLCS), %#));\n" ."define BracketedWordSeq RandomBracketing(WordSeq);\n" ."regex {PATTERN:} Markup(Filter2(Filter1(RepeatedPatterns & BracketedWordSeq)) .o. RepeatedPatternsEQ);\n" ."words\n" ."quit\n"; close($Writer); my @patterns = (); $has_analysis = 1; while (<$Reader>) { if ($_ =~ /0 paths/) { $has_analysis = 0; last; } if ($_ =~ /^PATTERN:/) { s/^PATTERN://g; chomp; push(@patterns, $_); } } close($Reader); waitpid ($pidMORPH, 0); # If we didn't get an analysis with the first (quick) method # where we don't use RepeatedPatternsEQ, we do the more thorough # but slower filtering, where we use RepeatedPatternsEQ from the beginning if ($has_analysis == 0) { $pidMORPH = open2($Reader, $Writer, "/usr/bin/env foma -p -l $dirname/extract.foma"); print $Writer "define MLCS Longest( " .join('&', @subs) ." ) ;\n" ."define WordSeq {" .join('#', @wordforms) ."};\n" ."define RepeatedPatternsEQ AddExtra(RedupN(MarkRoot(MLCS), %#));\n" ."define BracketedWordSeq RandomBracketing(WordSeq);\n" ."regex {PATTERN:} Markup(Filter2(Filter1(RepeatedPatternsEQ & BracketedWordSeq)));\n" ."words\n" ."quit\n"; close($Writer); @patterns = (); while (<$Reader>) { if ($_ =~ /^PATTERN:/) { s/^PATTERN://g; chomp; push(@patterns, $_); } } close($Reader); waitpid ($pidMORPH, 0); } ### $mincost = 99999; for $i (0 .. $#patterns) { $cost = 0; @words = split '#', $patterns[$i]; foreach (@words) { s/^\"[^"]+\"//g; # cut beginning s/\+\"[^"]+\"$//g; # cut end s/\"([^"]+)\"/"|" x length($1)/eg; # replace each quoted sym with auxiliary | s/[^|]//g; # remove everything else $cost += length($_); # get "gappiness" count for string } @patterncosts[$i] = $cost; if ($cost < $mincost) { $mincost = $cost; } # store minimum cost to use for filtering } $printouts = 0; for $i (0 .. $#patterns) { if ($lcp ne "") { $patterns[$i] =~ s/^\|/$lcp/; # Put back LCP $patterns[$i] =~ s/#\|/#$lcp/g; } $patternstring = $patterns[$i]; if ($patterncosts[$i] <= $mincost or $PRINT_ALL == 1) { $printouts++; if ($printouts > 1 and $PRINT_ALL == 1) { print STDERR "***WARNING: Pattern is ambiguous***\n"; } if ($PRINT_PARADIGM == 0) { $vstr = paradigm_get_variables($patternstring); $pstr = paradigm_generalize($patternstring); if ($printouts < 2 or $PRINT_ALL == 1) { push @outpatterns, $pstr ."\t" .$vstr; } } else { if ($printouts < 2 or $PRINT_ALL == 1) { $patterns[$i] =~ s/\+\",\"\#/,/g; $patterns[$i] =~ s/,\"\#/",/g; @words = split '#', $patterns[$i]; print join("\n", @words) ."\n\n"; } } } } } # Join alternate forms map {s/\+\",\"\#/,/g; } @outpatterns; map {s/,\"\#/",/g; } @outpatterns; if ($PRINT_PARADIGM == 0) { if ($COLLAPSE_PARADIGMS == 1) { print join ("\n", paradigm_uniq(@outpatterns)) ."\n"; } else { print join("\n", @outpatterns) ."\n"; } } sub paradigm_uniq { # Collapses identical paradigms # Pass array of paradigms and variables, and return combined paradigms # Example input: # 1#1+"are"#1+"ast" (TAB) 0=fort,1=fort <= [array entry 1] # 1#1+"are"#1+"ast" (TAB) 0=tokig,1=tokig <= [array entry 2] # Output: # 1#1+"are"#1+"ast" (TAB) 0=fort,1=fort#0=tokig,1=tokig [array entry 1] my @p = @_; my @uniqp = (); foreach (@p) { (my $lhs, my $rhs) = split /\t/; my $found = 0; for my $i (0 .. $#uniqp) { (my $lhsuniq, my $rhsuniq) = split /\t/, $uniqp[$i]; if ($lhs eq $lhsuniq) { $uniqp[$i] = $lhsuniq ."\t" .$rhsuniq ."#" .$rhs; $found = 1; last; } } if ($found == 0) { push @uniqp, $lhs ."\t" .$rhs; } } return(@uniqp); } sub paradigm_get_variables { my $ptrn = shift; my @forms = split '#', $ptrn; my $varcnt = 1; $firstform = $forms[0]; $fullform = $firstform; $fullform =~ s/,//g; $fullform =~ s/["+]+//g; $firstform = "+" .$firstform ."+"; $firstform =~ s/\+([^+"]+)(?=\+)/"," .$varcnt++ ."=" .$1/ge; $firstform =~ s/\"[^"]+\"//g; $firstform =~ s/\+//g; my $varline = "0=$fullform" .$firstform; return($varline); } sub paradigm_generalize { # Convert a pattern of a specific type to a general one with variables, e.g. # ka+"t"+to#ka+to+"t"#ka+to+"n"#ka+"t"+to+"jen"#... => # 1+"t"+2#1+2+"t"#1+2+"n"#1+"t"+2+"jen"#... # This allows for easier comparison of paradigms my $ptrn = shift; my @forms = split '#', $ptrn; my @newforms; foreach (@forms) { my $varcnt = 1; my $pstring = "+" .$_ ."+"; $pstring =~ s/\+([^+"]+)(?=\+)/"+" .$varcnt++/ge; $pstring =~ s/^\+//g; $pstring =~ s/\+$//g; push @newforms, $pstring; } $genforms = join ('#', @newforms); return $genforms; } # Returns the longest common prefix of an array of strings sub longest_common_prefix { my $p = shift; for (@_) { chop $p while (! /^\Q$p\E/); } # Make sure we don't have any partial utf8 hanging at the end thanks to perl $p =~ s/^(([\001-\177]|[\300-\337].|[\340-\357]..|[\360-\367]...)+).*/\1/; return $p; } # Debug printing sub dprint { my $arg = shift; if ($DEBUG == 1) { print STDERR $arg; } }