# -*- coding: utf-8 -*-
"""
Tests for BLEU translation evaluation metric
"""

import io
import unittest

from nltk.data import find
from nltk.translate.bleu_score import (modified_precision, brevity_penalty,
                                       closest_ref_length)
from nltk.translate.bleu_score import sentence_bleu, corpus_bleu, SmoothingFunction


class TestBLEU(unittest.TestCase):
    def test_modified_precision(self):
        """
        Examples from the original BLEU paper
        http://www.aclweb.org/anthology/P02-1040.pdf
        """
        # Example 1: the "the*" example.
        # Reference sentences.
        ref1 = 'the cat is on the mat'.split()
        ref2 = 'there is a cat on the mat'.split()
        # Hypothesis sentence(s).
        hyp1 = 'the the the the the the the'.split()

        references = [ref1, ref2]

        # Testing modified unigram precision.
        hyp1_unigram_precision = float(modified_precision(references, hyp1, n=1))
        assert round(hyp1_unigram_precision, 4) == 0.2857
        # With assertAlmostEqual to 4 decimal places.
        self.assertAlmostEqual(hyp1_unigram_precision, 0.28571428, places=4)

        # Testing modified bigram precision.
        assert float(modified_precision(references, hyp1, n=2)) == 0.0

        # Example 2: the "of the" example.
        # Reference sentences.
        ref1 = str('It is a guide to action that ensures that the military '
                   'will forever heed Party commands').split()
        ref2 = str('It is the guiding principle which guarantees the military '
                   'forces always being under the command of the Party').split()
        ref3 = str('It is the practical guide for the army always to heed '
                   'the directions of the party').split()
        # Hypothesis sentence(s).
        hyp1 = 'of the'.split()

        references = [ref1, ref2, ref3]
        # Testing modified unigram precision.
        assert float(modified_precision(references, hyp1, n=1)) == 1.0
        # Testing modified bigram precision.
        assert float(modified_precision(references, hyp1, n=2)) == 1.0

        # Example 3: Proper MT outputs.
        hyp1 = str('It is a guide to action which ensures that the military '
                   'always obeys the commands of the party').split()
        hyp2 = str('It is to insure the troops forever hearing the activity '
                   'guidebook that party direct').split()

        references = [ref1, ref2, ref3]

        # Unigram precision.
        hyp1_unigram_precision = float(modified_precision(references, hyp1, n=1))
        hyp2_unigram_precision = float(modified_precision(references, hyp2, n=1))
        # Test unigram precision with assertAlmostEqual to 4 decimal places.
        self.assertAlmostEqual(hyp1_unigram_precision, 0.94444444, places=4)
        self.assertAlmostEqual(hyp2_unigram_precision, 0.57142857, places=4)
        # Test unigram precision with rounding.
        assert round(hyp1_unigram_precision, 4) == 0.9444
        assert round(hyp2_unigram_precision, 4) == 0.5714

        # Bigram precision.
        hyp1_bigram_precision = float(modified_precision(references, hyp1, n=2))
        hyp2_bigram_precision = float(modified_precision(references, hyp2, n=2))
        # Test bigram precision with assertAlmostEqual to 4 decimal places.
        self.assertAlmostEqual(hyp1_bigram_precision, 0.58823529, places=4)
        self.assertAlmostEqual(hyp2_bigram_precision, 0.07692307, places=4)
        # Test bigram precision with rounding.
        assert round(hyp1_bigram_precision, 4) == 0.5882
        assert round(hyp2_bigram_precision, 4) == 0.0769

    def test_brevity_penalty(self):
        # Test case from the brevity_penalty_closest function in mteval-v13a.pl.
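        # Per the BLEU paper, the brevity penalty is BP = 1 if c > r, else
        # exp(1 - r/c), where c is the hypothesis length and r is the
        # reference length closest to c. In the first case below, c = 7 and
        # the closest reference length is r = 8, so BP = exp(1 - 8/7) ~ 0.8669.
        # In the second case, a reference of length exactly 7 exists, so BP = 1.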
        # Same test cases as in the doctest in nltk.translate.bleu_score.py
        references = [['a'] * 11, ['a'] * 8]
        hypothesis = ['a'] * 7
        hyp_len = len(hypothesis)
        closest_ref_len = closest_ref_length(references, hyp_len)
        self.assertAlmostEqual(brevity_penalty(closest_ref_len, hyp_len),
                               0.8669, places=4)

        references = [['a'] * 11, ['a'] * 8, ['a'] * 6, ['a'] * 7]
        hypothesis = ['a'] * 7
        hyp_len = len(hypothesis)
        closest_ref_len = closest_ref_length(references, hyp_len)
        assert brevity_penalty(closest_ref_len, hyp_len) == 1.0

    def test_zero_matches(self):
        # Test case where there are 0 matches.
        references = ['The candidate has no alignment to any of the references'.split()]
        hypothesis = 'John loves Mary'.split()

        # Test BLEU for n-gram orders from 1 up to len(hypothesis).
        for n in range(1, len(hypothesis) + 1):
            weights = [1.0 / n] * n  # Uniform weights.
            assert sentence_bleu(references, hypothesis, weights) == 0

    def test_full_matches(self):
        # Test case where there's a 100% match.
        references = ['John loves Mary'.split()]
        hypothesis = 'John loves Mary'.split()

        # Test BLEU for n-gram orders from 1 up to len(hypothesis).
        for n in range(1, len(hypothesis) + 1):
            weights = [1.0 / n] * n  # Uniform weights.
            assert sentence_bleu(references, hypothesis, weights) == 1.0

    def test_partial_matches_hypothesis_longer_than_reference(self):
        references = ['John loves Mary'.split()]
        hypothesis = 'John loves Mary who loves Mike'.split()
        self.assertAlmostEqual(sentence_bleu(references, hypothesis),
                               0.4729, places=4)
        # Checks that the warning has been raised because len(reference) < 4.
        try:
            self.assertWarns(UserWarning, sentence_bleu, references, hypothesis)
        except AttributeError:
            pass  # unittest.TestCase.assertWarns is only supported in Python >= 3.2.


#@unittest.skip("Skipping fringe cases for BLEU.")
class TestBLEUFringeCases(unittest.TestCase):
    def test_case_where_n_is_bigger_than_hypothesis_length(self):
        # Test BLEU to nth order of n-grams, where n > len(hypothesis).
        references = ['John loves Mary ?'.split()]
        hypothesis = 'John loves Mary'.split()
        n = len(hypothesis) + 1
        weights = [1.0 / n] * n  # Uniform weights.
        self.assertAlmostEqual(sentence_bleu(references, hypothesis, weights),
                               0.7165, places=4)
        # Checks that the warning has been raised because len(hypothesis) < 4.
        try:
            self.assertWarns(UserWarning, sentence_bleu, references, hypothesis)
        except AttributeError:
            pass  # unittest.TestCase.assertWarns is only supported in Python >= 3.2.

        # Test case where n > len(hypothesis) and also n > len(reference);
        # this is the special case where reference == hypothesis.
        references = ['John loves Mary'.split()]
        hypothesis = 'John loves Mary'.split()
        assert sentence_bleu(references, hypothesis, weights) == 1.0

    def test_empty_hypothesis(self):
        # Test case where the hypothesis is empty.
        references = ['The candidate has no alignment to any of the references'.split()]
        hypothesis = []
        assert sentence_bleu(references, hypothesis) == 0

    def test_empty_references(self):
        # Test case where the reference is empty.
        references = [[]]
        hypothesis = 'John loves Mary'.split()
        assert sentence_bleu(references, hypothesis) == 0

    def test_empty_references_and_hypothesis(self):
        # Test case where both the references and the hypothesis are empty.
        references = [[]]
        hypothesis = []
        assert sentence_bleu(references, hypothesis) == 0

    def test_reference_or_hypothesis_shorter_than_fourgrams(self):
        # Test case where the length of the reference or hypothesis
        # is shorter than 4.
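        # sentence_bleu defaults to uniform 4-gram weights, i.e.
        # (0.25, 0.25, 0.25, 0.25), so a 3-token reference/hypothesis has no
        # 4-grams at all; the 0 counts for the higher n-gram orders are what
        # trigger the UserWarning checked below.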
        references = ['let it go'.split()]
        hypothesis = 'let go it'.split()
        # Checks that the BLEU score for this hypothesis-reference pair is 1.0.
        assert sentence_bleu(references, hypothesis) == 1.0
        # Checks that the warning has been raised.
        try:
            self.assertWarns(UserWarning, sentence_bleu, references, hypothesis)
        except AttributeError:
            pass  # unittest.TestCase.assertWarns is only supported in Python >= 3.2.


class TestBLEUvsMteval13a(unittest.TestCase):
    def test_corpus_bleu(self):
        ref_file = find('models/wmt15_eval/ref.ru')
        hyp_file = find('models/wmt15_eval/google.ru')
        mteval_output_file = find('models/wmt15_eval/mteval-13a.output')

        # Reads the BLEU scores from the `mteval-13a.output` file.
        # The order of the scores corresponds to the n-gram order.
        with open(mteval_output_file, 'r') as mteval_fin:
            # The scores are located on the second-to-last line of the file.
            # The first and last items on that line are a label and the system
            # name, not scores, so they are sliced off. The scores are
            # materialized into a list so that they can be iterated over twice.
            mteval_bleu_scores = list(map(float,
                                          mteval_fin.readlines()[-2].split()[1:-1]))

        with io.open(ref_file, 'r', encoding='utf8') as ref_fin:
            with io.open(hyp_file, 'r', encoding='utf8') as hyp_fin:
                # Whitespace tokenize the file.
                # Note: split() without arguments also strips whitespace.
                hypothesis = list(map(lambda x: x.split(), hyp_fin))
                # Note that corpus_bleu expects a list of lists of references.
                references = list(map(lambda x: [x.split()], ref_fin))
                # Without smoothing.
                for i, mteval_bleu in zip(range(1, 10), mteval_bleu_scores):
                    nltk_bleu = corpus_bleu(references, hypothesis,
                                            weights=(1.0 / i,) * i)
                    # Check that the BLEU score difference is less than 0.005.
                    # Note: This is an approximate comparison; as much as
                    # +/- 0.01 BLEU might be "statistically significant",
                    # the actual translation quality might not be.
                    assert abs(mteval_bleu - nltk_bleu) < 0.005

                # With the same smoothing method used in mteval-v13a.pl.
                chencherry = SmoothingFunction()
                for i, mteval_bleu in zip(range(1, 10), mteval_bleu_scores):
                    nltk_bleu = corpus_bleu(references, hypothesis,
                                            weights=(1.0 / i,) * i,
                                            smoothing_function=chencherry.method3)
                    assert abs(mteval_bleu - nltk_bleu) < 0.005


class TestEmulateMultiBLEU(unittest.TestCase):
    def test_corpus_bleu_with_emulate_multibleu(self):
        hyp = "Teo S yb , oe uNb , R , T t , , t Tue Ar saln S , , 5istsi l , 5oe R ulO sae oR R"
        ref = str("Their tasks include changing a pump on the faulty stokehold ."
                  "Likewise , two species that are very similar in morphology "
                  "were distinguished using genetics .")
        references = [[ref.split()]]
        hypotheses = [hyp.split()]
        try:
            # Check that the warning is raised since the hypothesis has no
            # matching higher-order n-grams against the reference.
            with self.assertWarns(UserWarning):
                # Verify that the BLEU output is the undesired score in that case.
                self.assertAlmostEqual(corpus_bleu(references, hypotheses),
                                       0.4309, places=4)
        except AttributeError:
            pass  # unittest.TestCase.assertWarns is only supported in Python >= 3.2.
        desired_output = corpus_bleu(references, hypotheses,
                                     emulate_multibleu=True)
        assert desired_output == 0.0
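

# Allows the tests to be run directly, e.g. `python test_bleu.py`,
# in addition to a test runner such as `python -m unittest`.
if __name__ == '__main__':
    unittest.main()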