#!/usr/bin/env python
# -*- coding: utf-8 -*-
import codecs
import cjson
import datetime
import sys
#saldo_file="/home/markus/sblex_update/saldo/saldo.txt"
#saldo_morpho_file="/home/markus/sblex_update/saldo/saldo.lex"
# Input files, given as bare names and therefore looked up in the current
# working directory.
saldo_file = "saldo.txt"
saldo_morpho_file = "saldo.lex"

# Module-level stores populated by read_data():
#   saldo     -- list of 7-field rows split from saldo_file
#   saldom    -- list of decoded records, one per line of saldo_morpho_file
#   saldo_sem -- set of (field 0, field 1, field 2) triples taken from saldo rows
saldo = []
saldom = []
saldo_sem = set()
def fathers(d):
    """Return the space-separated items of ``d[2]`` as a list.

    When ``d[2]`` is exactly the string ``'PRIM..1'`` an empty list is
    returned instead.
    """
    field = d[2]
    return [] if field == 'PRIM..1' else field.split(' ')
def number_of_entries():
    """Return how many rows are currently held in the ``saldo`` list."""
    entry_total = len(saldo)
    return entry_total
def number_of_senses():
    """Return the number of distinct first elements among ``saldo_sem`` triples."""
    distinct = set()
    for triple in saldo_sem:
        distinct.add(triple[0])
    return len(distinct)
def descriptor_count(d):
    """Return the number of descriptors attached to the triple ``d``.

    The result is 0 when ``d[1]`` is ``'PRIM..1'``; otherwise it is one (for
    ``d[1]`` itself) plus the number of items returned by ``fathers(d)``.
    """
    if d[1] != 'PRIM..1':
        return len(fathers(d)) + 1
    return 0
def number_of_mothers():
    """Return the number of distinct second elements among ``saldo_sem`` triples."""
    distinct = set(triple[1] for triple in saldo_sem)
    return len(distinct)
def number_of_fathers():
    """Return the number of distinct space-separated items found in ``d[2]``.

    Every triple in ``saldo_sem`` contributes the pieces of its third element.
    No value is filtered out, so ``'PRIM..1'`` is counted like any other item.
    """
    distinct = set()
    for triple in saldo_sem:
        distinct.update(triple[2].split(' '))
    return len(distinct)
def number_of_lemgrams():
    """Return the number of distinct values in field 3 of the ``saldo`` rows."""
    distinct = set(row[3] for row in saldo)
    return len(distinct)
def number_of_gf():
    """Return the number of distinct values in field 4 of the ``saldo`` rows."""
    distinct = set()
    for row in saldo:
        distinct.add(row[4])
    return len(distinct)
def number_of_pos():
    """Return the number of distinct values in field 5 of the ``saldo`` rows."""
    distinct = set(row[5] for row in saldo)
    return len(distinct)
def number_of_paradigms():
    """Return the number of distinct values in field 6 of the ``saldo`` rows."""
    distinct = set()
    for row in saldo:
        distinct.add(row[6])
    return len(distinct)
def number_of_wordforms():
    """Return how many decoded records are currently held in ``saldom``."""
    wordform_total = len(saldom)
    return wordform_total
# Format the value `pos` for display; `lang` selects between a 'swe' branch
# and a branch for every other language code.
# NOTE(review): as written, both branches apply a template with a single
# '%s' to the two-element tuple (pos, pos), which raises
# "TypeError: not all arguments converted during string formatting".
# The two branches are also textually identical. Presumably the templates
# once held two placeholders (e.g. markup wrapped around the value) that were
# lost -- restore them from the authoritative copy of this file.
def pref(pos,lang):
if lang == 'swe':
return '%s' % (pos,pos)
else:
return '%s' % (pos,pos)
# Render the frequency table of field-5 values of the saldo rows.
# `lang` is forwarded to pref() and pr_number() for every rendered cell.
# The result is the "\n"-joined fragments below, encoded as UTF-8.
# NOTE(review): the string literals in the return expression contain raw line
# breaks inside single- and double-quoted strings, which is not valid Python;
# their original content (presumably markup) appears to have been lost.
# Restore them from the authoritative copy before running this function.
def pos_table(lang):
# One field-5 value per saldo row (duplicates kept so they can be counted).
pos = [d[5] for d in saldo]
# (occurrence count, value) pairs, largest count first. Note that
# pos.count(p) rescans the whole list once per distinct value.
table = sorted([(pos.count(p),p) for p in set(pos)],reverse=True)
# The table is emitted in three slices -- [:15], [15:30] and [30:] -- each
# rendered twice: values through pref(), then counts through pr_number().
# For the last slice, 14-len(table[30:]) filler fragments are appended after
# each of the two renderings.
return "\n".join(['
',
"\n".join(['
%s
' % (pref(p,lang)) for (_,p) in table[:15]]),
"
",
"\n".join(['
%s
' % (pr_number(n,lang)) for (n,_) in table[:15]]),
"
",
'
',
"\n".join(['
%s
' % (pref(p,lang)) for (_,p) in table[15:30]]),
"
",
"\n".join(['
%s
' % (pr_number(n,lang)) for (n,_) in table[15:30]]),
"
",
'
',
"\n".join(['
%s
' % (pref(p,lang)) for (_,p) in table[30:]]),
"%s
" %("".join(['
' for _ in range(14-len(table[30:]))])),
"\n".join(['
%s
' % (pr_number(n,lang)) for (n,_) in table[30:]]),
"%s
" %("".join(['
' for _ in range(14-len(table[30:]))])),
"
",
'
'
]).encode('UTF-8')
# Render a two-row table: the distinct descriptor counts, and for each one the
# number of saldo_sem triples that have it. `lang` selects the row labels from
# `locale` and is forwarded to pr_number(). Returns the "\n"-joined fragments.
# NOTE(review): the string literals in the return expression contain raw line
# breaks inside quoted strings, which is not valid Python; their original
# content (presumably markup) appears to have been lost. Restore them from the
# authoritative copy before running this function.
def number_of_descriptors(lang):
# Histogram: descriptor_count(d) -> number of triples with that count.
count = {}
for d in saldo_sem:
c = descriptor_count(d)
if c in count:
count[c] += 1
else:
count[c] = 1
# (descriptor count, number of triples) pairs in ascending descriptor count.
res = sorted(count.items())
# First row: label locale['desc_count'] followed by the descriptor counts.
# Second row: label locale['senses'] followed by the matching triple totals.
return "\n".join(['
',
'
%s
' % (locale['desc_count'][lang]),
"".join(['
%s
' % (pr_number(n,lang)) for (n,_) in res]),
"
",
'
%s
' % (locale['senses'][lang]),
"".join(['
%s
' % (pr_number(n,lang)) for (_,n) in res]),
'
'
])
def pr_number(n, lang):
    """Return ``n`` as text with a separator between groups of three digits.

    ``lang == 'swe'`` selects ``'.'`` as the separator; every other value of
    ``lang`` selects ``','``.  Groups are counted from the right-hand end of
    ``str(n)``.

    A leading minus sign is kept outside the grouping, so ``-123`` is rendered
    as ``'-123'`` rather than with a separator directly after the sign.
    """
    separator = '.' if lang == 'swe' else ','
    text = str(n)

    # Peel off the sign so it is not counted as a digit when grouping.
    sign = ''
    if text.startswith('-'):
        sign = '-'
        text = text[1:]

    # The leftmost group holds the 1-3 characters left over after all the
    # full groups of three have been taken from the right.
    head = len(text) % 3 or 3
    groups = [text[:head]]
    for start in range(head, len(text), 3):
        groups.append(text[start:start + 3])
    return sign + separator.join(groups)
def read_data():
    """Load both input files into the module-level stores.

    ``saldo_file`` is read as UTF-8, one tab-separated row per line.  Rows
    with exactly seven fields are appended to ``saldo`` and their first three
    fields are added to ``saldo_sem`` as a tuple; other rows are ignored.

    ``saldo_morpho_file`` is read as UTF-8, one record per line.  Each line is
    decoded with ``cjson.decode`` and appended to ``saldom``; lines that fail
    to decode are skipped.

    Both files are closed when reading finishes, including on error.
    """
    with codecs.open(saldo_file, encoding='UTF-8') as lexicon:
        for line in lexicon:
            # Remove only an actual newline: slicing the last character off
            # unconditionally would truncate a final line that has none.
            if line.endswith('\n'):
                line = line[:-1]
            fields = line.split('\t')
            if len(fields) == 7:
                saldo.append(fields)
                saldo_sem.add((fields[0], fields[1], fields[2]))

    with codecs.open(saldo_morpho_file, encoding='UTF-8') as morphology:
        for line in morphology:
            try:
                saldom.append(cjson.decode(line))
            except Exception:
                # Best effort, as before: an undecodable line is skipped.
                # Catching Exception (not a bare except) lets
                # KeyboardInterrupt and SystemExit propagate.
                continue
# Display labels used in the report, keyed first by label id and then by
# language code ('swe' or 'eng').
locale = {
    'paradigm_list': {
        'swe': 'paradigmlista',
        'eng': 'paradigm list',
    },
    'compact_paradigm_list': {
        'swe': 'kompakt paradigmlista',
        'eng': 'compact paradigm list',
    },
    'paradigm_distribution': {
        'swe': 'paradigmdistribution',
        'eng': 'paradigm distribution',
    },
    'entries': {
        'swe': 'ingångar',
        'eng': 'entries',
    },
    'wordforms': {
        'swe': 'ordformer',
        'eng': 'wordforms',
    },
    'senses': {
        'swe': 'betydelser',
        'eng': 'senses',
    },
    'primaries': {
        'swe': 'primärer',
        'eng': 'primaries',
    },
    'secondaries': {
        'swe': 'sekundärer',
        'eng': 'secondaries',
    },
    'lemgrams': {
        'swe': 'lemgram',
        'eng': 'lemgrams',
    },
    'lemmas': {
        'swe': 'grundformer',
        'eng': 'lemmas',
    },
    'paradigms': {
        'swe': 'paradigm',
        'eng': 'paradigms',
    },
    'word_classes': {
        'swe': 'ordklasser',
        'eng': 'word classes',
    },
    'desc_count': {
        'swe': 'antal deskriptorer',
        'eng': 'descriptor count',
    },
}
def produce_report(lang):
tdate = str(datetime.date.today())
date_and_time = str(datetime.datetime.now()).split('.')[0]
return "\n".join([
"",
" ",
' ',
' ',
" Statistik: SALDO (%s)" % (date_and_time),
" ",
" ",
"