# FS-TFP/federatedscope/nlp/metric/bleu/bleu_scorer.py
"""
The implementations are adapted from https://github.com/tylin/coco-caption/
blob/master/pycocoevalcap/bleu/bleu_scorer.py
"""
import copy
import math
from collections import defaultdict


def precook(s, n=4, out=False):
    """
    Takes a string as input and returns an object that can be given to
    either cook_refs or cook_test. This is optional: cook_refs and cook_test
    can take string arguments as well.
    """
    words = s.split()
    counts = defaultdict(int)
    for k in range(1, n + 1):
        for i in range(len(words) - k + 1):
            ngram = tuple(words[i:i + k])
            counts[ngram] += 1
    return (len(words), counts)
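

# Example (illustrative): precook("the cat sat on the mat", n=2) returns
# (6, counts), where counts maps each n-gram tuple to its frequency,
# e.g. counts[("the",)] == 2, counts[("cat",)] == 1 and
# counts[("the", "cat")] == 1.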


def cook_refs(refs, eff=None, n=4):
    """
    Takes a list of reference sentences for a single segment
    and returns an object that encapsulates everything that BLEU
    needs to know about them.
    """
    reflen = []
    maxcounts = {}
    for ref in refs:
        rl, counts = precook(ref, n)
        reflen.append(rl)
        for (ngram, count) in counts.items():
            maxcounts[ngram] = max(maxcounts.get(ngram, 0), count)

    # Calculate effective reference sentence length.
    if eff == "shortest":
        reflen = min(reflen)
    elif eff == "average":
        reflen = float(sum(reflen)) / len(reflen)

    return (reflen, maxcounts)
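

# Example (illustrative): cook_refs(["the cat sat", "the cat lay"]) returns
# (reflen, maxcounts) with reflen == [3, 3] (kept as a list so the "closest"
# option can later be resolved against each test length) and maxcounts holding
# the per-n-gram maximum over the references, e.g.
# maxcounts[("the", "cat")] == 1 and maxcounts[("lay",)] == 1.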


def cook_test(test, refs, eff=None, n=4):
    """
    Takes a test sentence and returns an object that
    encapsulates everything that BLEU needs to know about it.
    """
    reflen, refmaxcounts = refs
    testlen, counts = precook(test, n, True)

    result = {}

    # Calculate effective reference sentence length.
    if eff == "closest":
        result["reflen"] = min((abs(rl - testlen), rl) for rl in reflen)[1]
    else:  # i.e., "average" or "shortest" or None
        result["reflen"] = reflen

    result["testlen"] = testlen
    result["guess"] = [max(0, testlen - k + 1) for k in range(1, n + 1)]
    result["correct"] = [0] * n
    for (ngram, count) in counts.items():
        result["correct"][len(ngram) - 1] += min(refmaxcounts.get(ngram, 0),
                                                 count)

    return result
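

# Example (illustrative): with the cooked references from the example above
# and test = "the cat sat", cook_test returns
#   {'reflen': [3, 3], 'testlen': 3,
#    'guess': [3, 2, 1, 0],       # candidate n-grams per order, n = 1..4
#    'correct': [3, 2, 1, 0]}     # clipped n-gram matches against the refs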


class BleuScorer(object):
    """
    Bleu scorer.
    """

    __slots__ = "n", "crefs", "ctest", "_score", "_ratio", "_testlen", \
        "_reflen", "special_reflen"

    # special_reflen is used in oracle (proportional effective ref len for
    # a node).

    def copy(self):
        ''' copy the refs.'''
        new = BleuScorer(n=self.n)
        new.ctest = copy.copy(self.ctest)
        new.crefs = copy.copy(self.crefs)
        new._score = None
        return new

    def __init__(self, test=None, refs=None, n=4, special_reflen=None):
        ''' singular instance '''
        self.n = n
        self.crefs = []
        self.ctest = []
        self.cook_append(test, refs)
        self.special_reflen = special_reflen

    def cook_append(self, test, refs):
        """
        called by constructor and __iadd__ to avoid creating new
        instances.
        """
        if refs is not None:
            self.crefs.append(cook_refs(refs))
            if test is not None:
                cooked_test = cook_test(test, self.crefs[-1])
                self.ctest.append(cooked_test)  # N.B.: -1
            else:
                self.ctest.append(
                    None)  # lens of crefs and ctest have to match

        self._score = None  # need to recompute

    def ratio(self, option=None):
        self.compute_score(option=option)
        return self._ratio

    def score_ratio(self, option=None):
        '''return (bleu, len_ratio) pair'''
        return (self.fscore(option=option), self.ratio(option=option))

    def score_ratio_str(self, option=None):
        return "%.4f (%.2f)" % self.score_ratio(option)

    def reflen(self, option=None):
        self.compute_score(option=option)
        return self._reflen

    def testlen(self, option=None):
        self.compute_score(option=option)
        return self._testlen

    def retest(self, new_test):
        if type(new_test) is str:
            new_test = [new_test]
        assert len(new_test) == len(self.crefs), new_test
        self.ctest = []
        for t, rs in zip(new_test, self.crefs):
            self.ctest.append(cook_test(t, rs))
        self._score = None
        return self

    def rescore(self, new_test):
        ''' replace test(s) with new test(s), and return the new score.'''
        return self.retest(new_test).compute_score()

    def size(self):
        assert len(self.crefs) == len(self.ctest), \
            "refs/test mismatch! %d<>%d" % (len(self.crefs), len(self.ctest))
        return len(self.crefs)

    def __iadd__(self, other):
        '''add an instance (e.g., from another sentence).'''
        if type(other) is tuple:
            # avoid creating new BleuScorer instances
            self.cook_append(other[0], other[1])
        else:
            assert self.compatible(other), "incompatible BLEUs."
            self.ctest.extend(other.ctest)
            self.crefs.extend(other.crefs)

        self._score = None  # need to recompute
        return self
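
    # Example (illustrative): segments are usually accumulated with `+=` and
    # scored once at the end, e.g.
    #
    #   scorer = BleuScorer(n=4)
    #   scorer += ("the cat is on the mat", ["the cat sat on the mat"])
    #   scorer += ("a dog barked", ["the dog barked loudly"])
    #   score, _ = scorer.compute_score()   # score == [BLEU-1, ..., BLEU-4]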

    def compatible(self, other):
        return isinstance(other, BleuScorer) and self.n == other.n

    def single_reflen(self, option="average"):
        return self._single_reflen(self.crefs[0][0], option)

    def _single_reflen(self, reflens, option=None, testlen=None):
        if option == "shortest":
            reflen = min(reflens)
        elif option == "average":
            reflen = float(sum(reflens)) / len(reflens)
        elif option == "closest":
            reflen = min((abs(rl - testlen), rl) for rl in reflens)[1]
        else:
            assert False, "unsupported reflen option %s" % option

        return reflen
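
    # Example (illustrative): for reflens = [3, 6, 7] and testlen = 5,
    # "shortest" gives 3, "average" gives 16 / 3, and "closest" gives 6
    # (the reference length nearest to the test length).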

    def recompute_score(self, option=None, verbose=0):
        self._score = None
        return self.compute_score(option, verbose)

    def compute_score(self, option=None, verbose=0):
        n = self.n
        small = 1e-9
        tiny = 1e-15  # so that if guess is 0 still return 0
        bleu_list = [[] for _ in range(n)]

        if self._score is not None:
            return self._score

        if option is None:
            option = "average" if len(self.crefs) == 1 else "closest"

        self._testlen = 0
        self._reflen = 0
        totalcomps = {
            'testlen': 0,
            'reflen': 0,
            'guess': [0] * n,
            'correct': [0] * n
        }

        # for each sentence
        for comps in self.ctest:
            testlen = comps['testlen']
            self._testlen += testlen

            if self.special_reflen is None:  # need computation
                reflen = self._single_reflen(comps['reflen'], option, testlen)
            else:
                reflen = self.special_reflen
            self._reflen += reflen

            for key in ['guess', 'correct']:
                for k in range(n):
                    totalcomps[key][k] += comps[key][k]

            # append per-sentence bleu score
            bleu = 1.
            for k in range(n):
                bleu *= (float(comps['correct'][k]) + tiny) / \
                        (float(comps['guess'][k]) + small)
                bleu_list[k].append(bleu**(1. / (k + 1)))
            # N.B.: avoid zero division
            ratio = (testlen + tiny) / (reflen + small)
            if ratio < 1:
                for k in range(n):
                    bleu_list[k][-1] *= math.exp(1 - 1 / ratio)

            if verbose > 1:
                print(comps, reflen)

        totalcomps['reflen'] = self._reflen
        totalcomps['testlen'] = self._testlen

        bleus = []
        bleu = 1.
        for k in range(n):
            bleu *= float(totalcomps['correct'][k] + tiny) / \
                (totalcomps['guess'][k] + small)
            bleus.append(bleu**(1. / (k + 1)))
        # N.B.: avoid zero division
        ratio = (self._testlen + tiny) / (self._reflen + small)
        if ratio < 1:
            for k in range(n):
                bleus[k] *= math.exp(1 - 1 / ratio)

        if verbose > 0:
            print(totalcomps)
            print("ratio:", ratio)

        self._score = bleus
        return self._score, bleu_list
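

# A minimal usage sketch (illustrative; the sentences below are made up, and
# the guard keeps this from running on import):
if __name__ == "__main__":
    scorer = BleuScorer(n=4)
    # each `+=` adds one hypothesis together with its list of references
    scorer += ("the cat is on the mat", ["the cat sat on the mat"])
    scorer += ("a man is riding a horse", ["a person rides a horse",
                                           "a man rides his horse"])
    # score[k] is the corpus-level BLEU-(k+1); bleu_list[k] holds the
    # per-sentence values it was aggregated from
    score, bleu_list = scorer.compute_score(option="closest")
    print("corpus BLEU-1..4:", score)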