Add python3 support #3

Open. Wants to merge 2 commits into base: master.
README.md (7 changes: 0 additions & 7 deletions)
@@ -26,12 +26,6 @@ Evaluation codes for MS COCO caption generation.
- meteor: Meteor evaluation codes
- rouge: Rouge-L evaluation codes
- cider: CIDEr evaluation codes
-- spice: SPICE evaluation codes
-
-## Setup ##
-
-- You will first need to download the [Stanford CoreNLP 3.6.0](http://stanfordnlp.github.io/CoreNLP/index.html) code and models for use by SPICE. To do this, run:
-./get_stanford_models.sh

## References ##

@@ -41,7 +35,6 @@ Evaluation codes for MS COCO caption generation.
- Meteor: [Project page](http://www.cs.cmu.edu/~alavie/METEOR/) with related publications. We use the latest version (1.5) of the [Code](https://github.com/mjdenkowski/meteor). Changes have been made to the source code to properly aggreate the statistics for the entire corpus.
- Rouge-L: [ROUGE: A Package for Automatic Evaluation of Summaries](http://anthology.aclweb.org/W/W04/W04-1013.pdf)
- CIDEr: [CIDEr: Consensus-based Image Description Evaluation] (http://arxiv.org/pdf/1411.5726.pdf)
-- SPICE: [SPICE: Semantic Propositional Image Caption Evaluation] (http://panderson.me/images/SPICE.pdf)

## Developers ##
- Xinlei Chen (CMU)
cocoEvalCapDemo.ipynb (7,998 changes: 38 additions & 7,960 deletions)

Large diffs are not rendered by default.

get_stanford_models.sh (3 changes: 1 addition & 2 deletions; mode 100755 → 100644)
@@ -1,5 +1,4 @@
-#!/usr/bin/env sh
-# This script downloads the Stanford CoreNLP models.
+# This script downloads the Stanford CoreNLP models.

CORENLP=stanford-corenlp-full-2015-12-09
SPICELIB=pycocoevalcap/spice/lib
pycocoevalcap/__init__.py (mode changed 100755 → 100644, no content changes)
pycocoevalcap/bleu/LICENSE (mode changed 100755 → 100644, no content changes)
pycocoevalcap/bleu/__init__.py (mode changed 100755 → 100644, no content changes)
pycocoevalcap/bleu/bleu.py (2 changes: 1 addition & 1 deletion)
@@ -8,7 +8,7 @@
# Last Modified : Thu 19 Mar 2015 09:13:28 PM PDT
# Authors : Hao Fang <[email protected]> and Tsung-Yi Lin <[email protected]>

-from bleu_scorer import BleuScorer
+from .bleu_scorer import BleuScorer


class Bleu:
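This one-line change is the pattern repeated throughout the package: Python 3 dropped implicit relative imports (PEP 328), so an intra-package import needs an explicit leading dot. A minimal, self-contained sketch of why; the demo_pkg/helper/consumer names are illustrative and not part of this PR:

```python
# Build a throwaway package on disk and import it, to show that the dotted
# form works under Python 3 while the bare "from helper import ..." form
# relied on Python 2's implicit relative imports.
import importlib
import os
import sys
import tempfile
import textwrap

root = tempfile.mkdtemp()
pkg = os.path.join(root, "demo_pkg")
os.makedirs(pkg)
open(os.path.join(pkg, "__init__.py"), "w").close()
with open(os.path.join(pkg, "helper.py"), "w") as f:
    f.write("VALUE = 42\n")
with open(os.path.join(pkg, "consumer.py"), "w") as f:
    f.write(textwrap.dedent("""\
        # 'from helper import VALUE' raises ModuleNotFoundError on Python 3
        from .helper import VALUE

        def get():
            return VALUE
    """))

sys.path.insert(0, root)
print(importlib.import_module("demo_pkg.consumer").get())  # prints 42
```

The same dot is added in cider.py and eval.py below.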
pycocoevalcap/bleu/bleu_scorer.py (32 changes: 16 additions & 16 deletions; mode 100755 → 100644)
@@ -26,8 +26,8 @@ def precook(s, n=4, out=False):
can take string arguments as well."""
words = s.split()
counts = defaultdict(int)
-for k in xrange(1,n+1):
-for i in xrange(len(words)-k+1):
+for k in range(1,n+1):
+for i in range(len(words)-k+1):
ngram = tuple(words[i:i+k])
counts[ngram] += 1
return (len(words), counts)
@@ -42,7 +42,7 @@ def cook_refs(refs, eff=None, n=4): ## lhuang: oracle will call with "average"
for ref in refs:
rl, counts = precook(ref, n)
reflen.append(rl)
-for (ngram,count) in counts.iteritems():
+for (ngram,count) in counts.items():
maxcounts[ngram] = max(maxcounts.get(ngram,0), count)

# Calculate effective reference sentence length.
@@ -57,7 +57,7 @@ def cook_refs(refs, eff=None, n=4): ## lhuang: oracle will call with "average"

return (reflen, maxcounts)

-def cook_test(test, (reflen, refmaxcounts), eff=None, n=4):
+def cook_test(test, reflen, refmaxcounts, eff=None, n=4):
'''Takes a test sentence and returns an object that
encapsulates everything that BLEU needs to know about it.'''

@@ -74,10 +74,10 @@ def cook_test(test, (reflen, refmaxcounts), eff=None, n=4):

result["testlen"] = testlen

result["guess"] = [max(0,testlen-k+1) for k in xrange(1,n+1)]
result["guess"] = [max(0,testlen-k+1) for k in range(1,n+1)]

result['correct'] = [0]*n
-for (ngram, count) in counts.iteritems():
+for (ngram, count) in counts.items():
result["correct"][len(ngram)-1] += min(refmaxcounts.get(ngram,0), count)

return result
@@ -112,7 +112,7 @@ def cook_append(self, test, refs):
if refs is not None:
self.crefs.append(cook_refs(refs))
if test is not None:
-cooked_test = cook_test(test, self.crefs[-1])
+cooked_test = cook_test(test, *self.crefs[-1])
self.ctest.append(cooked_test) ## N.B.: -1
else:
self.ctest.append(None) # lens of crefs and ctest have to match
@@ -144,7 +144,7 @@ def retest(self, new_test):
assert len(new_test) == len(self.crefs), new_test
self.ctest = []
for t, rs in zip(new_test, self.crefs):
-self.ctest.append(cook_test(t, rs))
+self.ctest.append(cook_test(t, *rs))
self._score = None

return self
@@ -224,40 +224,40 @@ def compute_score(self, option=None, verbose=0):
self._reflen += reflen

for key in ['guess','correct']:
-for k in xrange(n):
+for k in range(n):
totalcomps[key][k] += comps[key][k]

# append per image bleu score
bleu = 1.
-for k in xrange(n):
+for k in range(n):
bleu *= (float(comps['correct'][k]) + tiny) \
/(float(comps['guess'][k]) + small)
bleu_list[k].append(bleu ** (1./(k+1)))
ratio = (testlen + tiny) / (reflen + small) ## N.B.: avoid zero division
if ratio < 1:
-for k in xrange(n):
+for k in range(n):
bleu_list[k][-1] *= math.exp(1 - 1/ratio)

if verbose > 1:
-print comps, reflen
+print(comps, reflen)

totalcomps['reflen'] = self._reflen
totalcomps['testlen'] = self._testlen

bleus = []
bleu = 1.
-for k in xrange(n):
+for k in range(n):
bleu *= float(totalcomps['correct'][k] + tiny) \
/ (totalcomps['guess'][k] + small)
bleus.append(bleu ** (1./(k+1)))
ratio = (self._testlen + tiny) / (self._reflen + small) ## N.B.: avoid zero division
if ratio < 1:
-for k in xrange(n):
+for k in range(n):
bleus[k] *= math.exp(1 - 1/ratio)

if verbose > 0:
-print totalcomps
-print "ratio:", ratio
+print(totalcomps)
+print("ratio:%f"%ratio)

self._score = bleus
return self._score, bleu_list
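Taken together, the bleu_scorer.py hunks are four mechanical Python 2 to 3 substitutions: xrange() becomes range(), dict.iteritems() becomes dict.items(), the tuple parameter in cook_test's signature is flattened because tuple parameter unpacking was removed (PEP 3113), which is why call sites now pass *self.crefs[-1], and print statements become print() calls. A short sketch of the same constructs in their Python 3 form; the names here (ngram_counts, clipped_matches) are illustrative only and not part of the PR:

```python
from collections import defaultdict

def ngram_counts(sentence, n=4):
    """Count n-grams the way precook() does, with range() instead of xrange()."""
    words = sentence.split()
    counts = defaultdict(int)
    for k in range(1, n + 1):
        for i in range(len(words) - k + 1):
            counts[tuple(words[i:i + k])] += 1
    return len(words), counts   # the (length, counts) pair the scorer stores per sentence

# PEP 3113: "def cook_test(test, (reflen, refmaxcounts), ...)" is a SyntaxError in
# Python 3, so the pair becomes two ordinary parameters ...
def clipped_matches(test_counts, reflen, refmaxcounts):
    return sum(min(refmaxcounts.get(g, 0), c) for g, c in test_counts.items())

ref_len, ref_counts = ngram_counts("a cat sat on the mat")
test_len, test_counts = ngram_counts("the cat sat on the mat")
# ... and call sites unpack the stored pair with a star, as the patch does.
print(clipped_matches(test_counts, *(ref_len, ref_counts)))
```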
pycocoevalcap/cider/__init__.py (mode changed 100755 → 100644, no content changes)
pycocoevalcap/cider/cider.py (2 changes: 1 addition & 1 deletion; mode 100755 → 100644)
@@ -7,7 +7,7 @@
#
# Authors: Ramakrishna Vedantam <[email protected]> and Tsung-Yi Lin <[email protected]>

-from cider_scorer import CiderScorer
+from .cider_scorer import CiderScorer
import pdb

class Cider:
pycocoevalcap/cider/cider_scorer.py (10 changes: 5 additions & 5 deletions)
@@ -19,8 +19,8 @@ def precook(s, n=4, out=False):
"""
words = s.split()
counts = defaultdict(int)
-for k in xrange(1,n+1):
-for i in xrange(len(words)-k+1):
+for k in range(1,n+1):
+for i in range(len(words)-k+1):
ngram = tuple(words[i:i+k])
counts[ngram] += 1
return counts
@@ -99,7 +99,7 @@ def compute_doc_freq(self):
'''
for refs in self.crefs:
# refs, k ref captions of one image
-for ngram in set([ngram for ref in refs for (ngram,count) in ref.iteritems()]):
+for ngram in set([ngram for ref in refs for (ngram,count) in ref.items()]):
self.document_frequency[ngram] += 1
# maxcounts[ngram] = max(maxcounts.get(ngram,0), count)

@@ -115,7 +115,7 @@ def counts2vec(cnts):
vec = [defaultdict(float) for _ in range(self.n)]
length = 0
norm = [0.0 for _ in range(self.n)]
-for (ngram,term_freq) in cnts.iteritems():
+for (ngram,term_freq) in cnts.items():
# give word count 1 if it doesn't appear in reference corpus
df = np.log(max(1.0, self.document_frequency[ngram]))
# ngram index
@@ -146,7 +146,7 @@ def sim(vec_hyp, vec_ref, norm_hyp, norm_ref, length_hyp, length_ref):
val = np.array([0.0 for _ in range(self.n)])
for n in range(self.n):
# ngram
-for (ngram,count) in vec_hyp[n].iteritems():
+for (ngram,count) in vec_hyp[n].items():
# vrama91 : added clipping
val[n] += min(vec_hyp[n][ngram], vec_ref[n][ngram]) * vec_ref[n][ngram]

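cider_scorer.py needs only the same two substitutions (xrange to range, iteritems to items). One nuance: Python 2's iteritems() was a lazy iterator, while Python 3's items() returns a live view; it is a drop-in replacement here because none of these loops mutate the dictionary they iterate. A tiny illustration, not taken from the PR:

```python
# .items() in Python 3 is a view: lazy like the old iteritems(), and it reflects
# later changes to the dict. That is safe in the scorer, which only reads the
# counts while looping.
doc_freq = {("a",): 3, ("cat",): 1}
view = doc_freq.items()
doc_freq[("mat",)] = 2
print(sorted(view))   # [(('a',), 3), (('cat',), 1), (('mat',), 2)]
```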
pycocoevalcap/eval.py (22 changes: 11 additions & 11 deletions)
@@ -1,10 +1,10 @@
__author__ = 'tylin'
-from tokenizer.ptbtokenizer import PTBTokenizer
-from bleu.bleu import Bleu
-from meteor.meteor import Meteor
-from rouge.rouge import Rouge
-from cider.cider import Cider
-from spice.spice import Spice
+from .tokenizer.ptbtokenizer import PTBTokenizer
+from .bleu.bleu import Bleu
+from .meteor.meteor import Meteor
+from .rouge.rouge import Rouge
+from .cider.cider import Cider
+from .spice.spice import Spice

class COCOEvalCap:
def __init__(self, coco, cocoRes):
@@ -27,15 +27,15 @@ def evaluate(self):
# =================================================
# Set up scorers
# =================================================
-print 'tokenization...'
+print('tokenization...')
tokenizer = PTBTokenizer()
gts = tokenizer.tokenize(gts)
res = tokenizer.tokenize(res)

# =================================================
# Set up scorers
# =================================================
-print 'setting up scorers...'
+print('setting up scorers...')
scorers = [
(Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
(Meteor(),"METEOR"),
@@ -48,17 +48,17 @@ def evaluate(self):
# Compute scores
# =================================================
for scorer, method in scorers:
-print 'computing %s score...'%(scorer.method())
+print('computing %s score...'%(scorer.method()))
score, scores = scorer.compute_score(gts, res)
if type(method) == list:
for sc, scs, m in zip(score, scores, method):
self.setEval(sc, m)
self.setImgToEvalImgs(scs, gts.keys(), m)
print "%s: %0.3f"%(m, sc)
print("%s: %0.3f"%(m, sc))
else:
self.setEval(score, method)
self.setImgToEvalImgs(scores, gts.keys(), method)
print "%s: %0.3f"%(method, score)
print("%s: %0.3f"%(method, score))
self.setEvalImgs()

def setEval(self, score, method):
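Beyond the dotted imports and print() calls, the one Python 3 subtlety left in eval.py is gts.keys(): it now returns a dict view rather than a list. The patch passes it to setImgToEvalImgs unchanged, which is fine as long as that method only iterates or zips over the ids (it is not shown in this diff, so that is an assumption); positional indexing would need an explicit list(). A small sketch with made-up ids and scores:

```python
# dict.keys() is a view in Python 3: iterating and zipping work unchanged,
# only indexing requires wrapping it in list().
gts = {42: ["a cat sat on the mat"], 7: ["a dog ran"]}
scores = [0.31, 0.58]

for img_id, score in zip(gts.keys(), scores):
    print("%s: %0.3f" % (img_id, score))

img_ids = list(gts.keys())   # needed only for positional access
print(img_ids[0])
```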
pycocoevalcap/meteor/__init__.py (mode changed 100755 → 100644, no content changes)
pycocoevalcap/meteor/data/paraphrase-en.gz (mode changed 100755 → 100644, no content changes)
pycocoevalcap/meteor/meteor-1.5.jar (mode changed 100755 → 100644, no content changes)
pycocoevalcap/meteor/meteor.py (24 changes: 14 additions & 10 deletions; mode 100755 → 100644)
@@ -37,10 +37,11 @@ def compute_score(self, gts, res):
stat = self._stat(res[i][0], gts[i])
eval_line += ' ||| {}'.format(stat)

-self.meteor_p.stdin.write('{}\n'.format(eval_line))
-for i in range(0,len(imgIds)):
-scores.append(float(self.meteor_p.stdout.readline().strip()))
-score = float(self.meteor_p.stdout.readline().strip())
+self.meteor_p.stdin.write('{}\n'.format(eval_line).encode())
+self.meteor_p.stdin.flush()
+for i in range(0, len(imgIds)):
+scores.append(float(self.meteor_p.stdout.readline().decode().strip()))
+score = float(self.meteor_p.stdout.readline().decode().strip())
self.lock.release()

return score, scores
@@ -52,20 +53,23 @@ def _stat(self, hypothesis_str, reference_list):
# SCORE ||| reference 1 words ||| reference n words ||| hypothesis words
hypothesis_str = hypothesis_str.replace('|||','').replace(' ',' ')
score_line = ' ||| '.join(('SCORE', ' ||| '.join(reference_list), hypothesis_str))
-self.meteor_p.stdin.write('{}\n'.format(score_line))
-return self.meteor_p.stdout.readline().strip()
+self.meteor_p.stdin.write('{}\n'.format(score_line).encode())
+self.meteor_p.stdin.flush()
+return self.meteor_p.stdout.readline().decode().strip()

def _score(self, hypothesis_str, reference_list):
self.lock.acquire()
# SCORE ||| reference 1 words ||| reference n words ||| hypothesis words
hypothesis_str = hypothesis_str.replace('|||','').replace(' ',' ')
score_line = ' ||| '.join(('SCORE', ' ||| '.join(reference_list), hypothesis_str))
-self.meteor_p.stdin.write('{}\n'.format(score_line))
-stats = self.meteor_p.stdout.readline().strip()
+self.meteor_p.stdin.write('{}\n'.format(score_line).encode())
+self.meteor_p.stdin.flush()
+stats = self.meteor_p.stdout.readline().decode().strip()
eval_line = 'EVAL ||| {}'.format(stats)
# EVAL ||| stats
-self.meteor_p.stdin.write('{}\n'.format(eval_line))
-score = float(self.meteor_p.stdout.readline().strip())
+self.meteor_p.stdin.write('{}\n'.format(eval_line).encode())
+self.meteor_p.stdin.flush()
+score = float(self.meteor_p.stdout.readline().decode().strip())
# bug fix: there are two values returned by the jar file, one average, and one all, so do it twice
# thanks for Andrej for pointing this out
score = float(self.meteor_p.stdout.readline().strip())
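The METEOR wrapper talks to a long-running Java process over pipes, which is where Python 3 bites hardest: Popen pipes are binary streams unless text mode is requested, and buffered writes are not delivered until flushed. Hence every line written to stdin above is encoded and flushed, and every line read back is decoded. A self-contained sketch of the same round trip against a stand-in child process (plain Python here, rather than the meteor-1.5.jar process):

```python
import subprocess
import sys

# A trivial line-oriented child standing in for the METEOR jar: it reads a
# line, "scores" it by upper-casing it, and writes the result back.
child_src = (
    "import sys\n"
    "while True:\n"
    "    line = sys.stdin.readline()\n"
    "    if not line:\n"
    "        break\n"
    "    sys.stdout.write(line.upper())\n"
    "    sys.stdout.flush()\n"
)

child = subprocess.Popen([sys.executable, "-c", child_src],
                         stdin=subprocess.PIPE, stdout=subprocess.PIPE)

child.stdin.write("SCORE ||| a cat sat ||| the cat sat\n".encode())  # bytes in
child.stdin.flush()               # without this, the child never sees the line
print(child.stdout.readline().decode().strip())                      # bytes out
child.stdin.close()
child.wait()
```

An alternative, which this PR does not take, is to open the pipes in text mode (universal_newlines=True) so the manual encode/decode calls disappear.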
pycocoevalcap/rouge/__init__.py (mode changed 100755 → 100644, no content changes)
pycocoevalcap/rouge/rouge.py (mode changed 100755 → 100644, no content changes)
pycocoevalcap/spice/__init__.py (mode changed 100755 → 100644, no content changes)
pycocoevalcap/spice/spice.py (6 changes: 3 additions & 3 deletions)
@@ -50,12 +50,12 @@ def compute_score(self, gts, res):
temp_dir=os.path.join(cwd, TEMP_DIR)
if not os.path.exists(temp_dir):
os.makedirs(temp_dir)
-in_file = tempfile.NamedTemporaryFile(delete=False, dir=temp_dir)
+in_file = tempfile.NamedTemporaryFile(mode='w+', delete=False, dir=temp_dir)
json.dump(input_data, in_file, indent=2)
in_file.close()

# Start job
-out_file = tempfile.NamedTemporaryFile(delete=False, dir=temp_dir)
+out_file = tempfile.NamedTemporaryFile(mode='w+', delete=False, dir=temp_dir)
out_file.close()
cache_dir=os.path.join(cwd, CACHE_DIR)
if not os.path.exists(cache_dir):
@@ -85,7 +85,7 @@ def compute_score(self, gts, res):
for image_id in imgIds:
# Convert none to NaN before saving scores over subcategories
score_set = {}
-for category,score_tuple in imgId_to_scores[image_id].iteritems():
+for category,score_tuple in imgId_to_scores[image_id].items():
score_set[category] = {k: self.float_convert(v) for k, v in score_tuple.items()}
scores.append(score_set)
return average_score, scores
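The spice.py change is purely about file modes: tempfile.NamedTemporaryFile opens in binary 'w+b' by default, and json.dump produces str in Python 3, so the temporary files are now opened in text mode. A minimal reproduction, illustrative rather than taken from the PR:

```python
# json.dump writes str, so the temporary file must be opened in a text mode
# ('w+') under Python 3; the default 'w+b' would raise a TypeError.
import json
import os
import tempfile

data = {"image_id": 1, "test": "a cat on a mat", "refs": ["a cat sat"]}

tmp = tempfile.NamedTemporaryFile(mode="w+", delete=False)   # text mode, as in the patch
json.dump(data, tmp, indent=2)
tmp.close()

with open(tmp.name) as f:
    print(json.load(f)["test"])   # round-trips cleanly
os.remove(tmp.name)
```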
pycocoevalcap/tokenizer/__init__.py (mode changed 100755 → 100644, no content changes)
pycocoevalcap/tokenizer/ptbtokenizer.py (6 changes: 3 additions & 3 deletions; mode 100755 → 100644)
@@ -40,8 +40,8 @@ def tokenize(self, captions_for_image):
# save sentences to temporary file
# ======================================================
path_to_jar_dirname=os.path.dirname(os.path.abspath(__file__))
-tmp_file = tempfile.NamedTemporaryFile(delete=False, dir=path_to_jar_dirname)
-tmp_file.write(sentences.encode('ascii','ignore'))
+tmp_file = tempfile.NamedTemporaryFile(mode='w+', delete=False, dir=path_to_jar_dirname)
+tmp_file.write(sentences)
tmp_file.close()

# ======================================================
@@ -51,7 +51,7 @@ def tokenize(self, captions_for_image):
p_tokenizer = subprocess.Popen(cmd, cwd=path_to_jar_dirname, \
stdout=subprocess.PIPE)
token_lines = p_tokenizer.communicate(input=sentences.rstrip())[0]
-lines = token_lines.split('\n')
+lines = token_lines.decode().split('\n')
# remove temp file
os.remove(tmp_file.name)

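Same story for the tokenizer wrapper: with a default (binary) stdout pipe, communicate() returns bytes in Python 3, so the output is decoded before being split into lines, and the temporary sentences file is opened in text mode so plain str can be written to it. A short sketch with a stand-in child process instead of the PTBTokenizer jar:

```python
import subprocess
import sys

# Any child whose stdout is captured through a default (binary) pipe hands
# back bytes under Python 3.
proc = subprocess.Popen(
    [sys.executable, "-c", "print('a cat sat on the mat')"],
    stdout=subprocess.PIPE)

raw = proc.communicate()[0]        # bytes, e.g. b'a cat sat on the mat\n'
lines = raw.decode().split('\n')   # decode first, then split as text
print(lines[0].split())            # ['a', 'cat', 'sat', 'on', 'the', 'mat']
```

One behavioral difference worth noting in the hunk above: writing sentences directly instead of sentences.encode('ascii','ignore') means non-ASCII characters are no longer silently dropped before tokenization.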
pycocoevalcap/tokenizer/stanford-corenlp-3.4.1.jar (mode changed 100755 → 100644, no content changes)