Skip to content

Commit

Permalink
Adding support for news items
Browse files Browse the repository at this point in the history
  • Loading branch information
glenrobson committed Sep 19, 2024
1 parent 0d9a212 commit 969de0f
Show file tree
Hide file tree
Showing 3 changed files with 225 additions and 67 deletions.
11 changes: 9 additions & 2 deletions iiify/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,12 @@
import os
import time
import requests
from flask import Flask, send_file, jsonify, abort, request, render_template, redirect
from flask import Flask, send_file, jsonify, abort, request, render_template, redirect, make_response
from flask_cors import CORS
from flask_caching import Cache
from iiif2 import iiif, web
from .resolver import ia_resolver, create_manifest, create_manifest3, getids, collection, \
purify_domain, cantaloupe_resolver, create_collection3, IsCollection, create_annotations
purify_domain, cantaloupe_resolver, create_collection3, IsCollection, create_annotations, create_vtt_stream
from .configs import options, cors, approot, cache_root, media_root, \
cache_expr, version, image_server, cache_timeouts
from urllib.parse import quote
Expand Down Expand Up @@ -197,6 +197,13 @@ def annnotations(version, identifier, fileName, canvas_no):
domain = purify_domain(request.args.get('domain', request.url_root))
return ldjsonify(create_annotations(version, identifier, fileName, canvas_no, domain=domain))

@app.route('/iiif/vtt/streaming/<identifier>.vtt')
@cache.cached(timeout=cache_timeouts["long"], forced_update=cache_bust)
def vtt_stream(identifier):
    """
    Serve the generated caption track for ``identifier``.

    The body is whatever ``create_vtt_stream`` returns for the identifier;
    the response is labelled ``text/vtt`` and cached with the "long" timeout.
    """
    vtt_body = create_vtt_stream(identifier)
    reply = make_response(vtt_body)
    # Set the header explicitly so the value is exactly 'text/vtt'
    reply.headers['Content-Type'] = 'text/vtt'
    return reply

@app.route('/iiif/<identifier>/manifest.json')
@cache.cached(timeout=cache_timeouts["long"], forced_update=cache_bust)
def manifest(identifier):
Expand Down
252 changes: 187 additions & 65 deletions iiify/resolver.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
import math
import re
import xml.etree.ElementTree as ET
from datetime import timedelta

IMG_CTX = 'http://iiif.io/api/image/2/context.json'
PRZ_CTX = 'http://iiif.io/api/presentation/2/context.json'
Expand Down Expand Up @@ -658,77 +659,131 @@ def create_manifest3(identifier, domain=None, page=None):
vttfiles[sourceFilename] = []

vttfiles[sourceFilename].append(f)

# create the canvases for each original
for file in [f for f in originals if f['format'] in ['MPEG4', 'h.264 MPEG4', '512Kb MPEG4', 'HiRes MPEG4', 'MPEG2', 'h.264', 'Matroska', 'Ogg Video', 'Ogg Theora', 'WebM', 'Windows Media', 'Cinepack']]:
normalised_id = file['name'].rsplit(".", 1)[0]

if 'access-restricted-item' in metadata['metadata'] and metadata['metadata']['access-restricted-item']:
# this is a news item so has to be treated differently
# https://ia801803.us.archive.org/29/items/CSPAN3_20180217_164800_Poplar_Forest_Archaeology/CSPAN3_20180217_164800_Poplar_Forest_Archaeology.mp4?start=0&end=360&ignore=x.mp4&cnt=0
mp4File = None
duration = 0.0
filedata = None
for file in metadata['files']:
if file['name'].endswith('.mp4'):
mp4File = file['name']
duration = float(file['length'])
filedata = file

normalised_id = mp4File.rsplit(".", 1)[0]
slugged_id = normalised_id.replace(" ", "-")
c_id = f"{URI_PRIFIX}/{identifier}/{slugged_id}/canvas"
c = Canvas(id=c_id, label=normalised_id, duration=float(file['length']), height=int(file['height']), width=int(file['width']))

# Add vtt if present
if vttfiles and normalised_id in vttfiles:
vttAPId = f"{URI_PRIFIX}/{identifier}/{slugged_id}/vtt"

vttNo = 1
for vttFile in vttfiles[normalised_id]:
vtAnno = c.make_annotation(id=f"{URI_PRIFIX}/{identifier}/{slugged_id}/annotation/vtt/{vttNo}",
motivation="supplementing",
target=c.id,
anno_page_id=vttAPId,
body={"id": f"{domain}resource/{identifier}/{vttFile['name']}",
"type": "Text",
"format": "text/vtt",
})
# add label and language
if vttFile['name'].endswith("autogenerated.vtt"):
vtAnno.body.label = { 'en': ['autogenerated']}
else:
# Assume language
splitName = vttFile['name'].split(".")
lang = splitName[-2]
vtAnno.body.add_label(lang, language="none")
vtAnno.body.language = lang

vttNo += 1

# create intermediary objects
c = Canvas(id=c_id, label=normalised_id, duration=duration, height=int(filedata['height']), width=int(filedata['width']))
ap = AnnotationPage(id=f"{URI_PRIFIX}/{identifier}/{slugged_id}/page")
anno = Annotation(id=f"{URI_PRIFIX}/{identifier}/{slugged_id}/annotation", motivation="painting", target=c.id)

# create body based on whether there are derivatives or not:
if file['name'] in derivatives:
body = Choice(items=[])
# add the choices in order per https://github.com/ArchiveLabs/iiif.archivelab.org/issues/77#issuecomment-1499672734
for format in ['MPEG4', 'h.264 MPEG4', '512Kb MPEG4', 'HiRes MPEG4', 'MPEG2', 'h.264', 'Matroska', 'Ogg Video', 'Ogg Theora', 'WebM', 'Windows Media', 'Cinepack']:
if format in derivatives[file['name']]:
r = ResourceItem(id=f"https://archive.org/download/{identifier}/{derivatives[file['name']][format]['name'].replace(' ', '%20')}",
type='Video',
format=to_mimetype(format),
label={"none": [format]},
duration=float(file['length']),
height=int(file['height']),
width=int(file['width']),
)
body.items.append(r)
elif file['format'] == format:
r = ResourceItem(
id=f"https://archive.org/download/{identifier}/{file['name'].replace(' ', '%20')}",
type='Video',
format=to_mimetype(format),
label={"none": [format]},
duration=float(file['length']),
height=int(file['height']),
width=int(file['width']))
body.items.append(r)
else:
# todo: deal with instances where there are no derivatives for whatever reason
pass

anno.body = body
ap.add_item(anno)
vttAPId = f"{URI_PRIFIX}/{identifier}/{slugged_id}/vtt"
vtAnno = c.make_annotation(id=f"{URI_PRIFIX}/{identifier}/{slugged_id}/annotation/vtt/streamed",
motivation="supplementing",
target=c.id,
anno_page_id=vttAPId,
body={"id": f"{domain}vtt/streaming/{identifier}.vtt",
"type": "Text",
"format": "text/vtt",
})

segments = math.floor(duration / 60)
for i in range(segments):
start = i * 60
if i == segments - 1:
end = int(duration)
else:
end = (i + 1) * 60

#print (f"Start: {start} End: {end}, Duration: {float(end) - float(start)} full duration: {duration}")
anno = Annotation(id=f"{URI_PRIFIX}/{identifier}/{slugged_id}/annotation/{i}", motivation="painting", target=f"{c.id}#t={start},{end}")
streamurl = f"https://{metadata['server']}{metadata['dir']}/{mp4File}?start={start}&end={end}&ignore=x.mp4&cnt=0"
body = ResourceItem(id=streamurl,
type='Video',
format="video/mp4",
label={"en": [f"Part {i + 1} of {segments}"]},
duration=end - start,
height=int(filedata['height']),
width=int(filedata['width']),
)

anno.body = body
ap.add_item(anno)

c.add_item(ap)
manifest.add_item(c)
else:
# create the canvases for each original
for file in [f for f in originals if f['format'] in ['MPEG4', 'h.264 MPEG4', '512Kb MPEG4', 'HiRes MPEG4', 'MPEG2', 'h.264', 'Matroska', 'Ogg Video', 'Ogg Theora', 'WebM', 'Windows Media', 'Cinepack']]:
normalised_id = file['name'].rsplit(".", 1)[0]
slugged_id = normalised_id.replace(" ", "-")
c_id = f"{URI_PRIFIX}/{identifier}/{slugged_id}/canvas"
c = Canvas(id=c_id, label=normalised_id, duration=float(file['length']), height=int(file['height']), width=int(file['width']))

# Add vtt if present
if vttfiles and normalised_id in vttfiles:
vttAPId = f"{URI_PRIFIX}/{identifier}/{slugged_id}/vtt"

vttNo = 1
for vttFile in vttfiles[normalised_id]:
vtAnno = c.make_annotation(id=f"{URI_PRIFIX}/{identifier}/{slugged_id}/annotation/vtt/{vttNo}",
motivation="supplementing",
target=c.id,
anno_page_id=vttAPId,
body={"id": f"{domain}resource/{identifier}/{vttFile['name']}",
"type": "Text",
"format": "text/vtt",
})
# add label and language
if vttFile['name'].endswith("autogenerated.vtt"):
vtAnno.body.label = { 'en': ['autogenerated']}
else:
# Assume language
splitName = vttFile['name'].split(".")
lang = splitName[-2]
vtAnno.body.add_label(lang, language="none")
vtAnno.body.language = lang

vttNo += 1

# create intermediary objects
ap = AnnotationPage(id=f"{URI_PRIFIX}/{identifier}/{slugged_id}/page")
anno = Annotation(id=f"{URI_PRIFIX}/{identifier}/{slugged_id}/annotation", motivation="painting", target=c.id)

# create body based on whether there are derivatives or not:
if file['name'] in derivatives:
body = Choice(items=[])
# add the choices in order per https://github.com/ArchiveLabs/iiif.archivelab.org/issues/77#issuecomment-1499672734
for format in ['MPEG4', 'h.264 MPEG4', '512Kb MPEG4', 'HiRes MPEG4', 'MPEG2', 'h.264', 'Matroska', 'Ogg Video', 'Ogg Theora', 'WebM', 'Windows Media', 'Cinepack']:
if format in derivatives[file['name']]:
r = ResourceItem(id=f"https://archive.org/download/{identifier}/{derivatives[file['name']][format]['name'].replace(' ', '%20')}",
type='Video',
format=to_mimetype(format),
label={"none": [format]},
duration=float(file['length']),
height=int(file['height']),
width=int(file['width']),
)
body.items.append(r)
elif file['format'] == format:
r = ResourceItem(
id=f"https://archive.org/download/{identifier}/{file['name'].replace(' ', '%20')}",
type='Video',
format=to_mimetype(format),
label={"none": [format]},
duration=float(file['length']),
height=int(file['height']),
width=int(file['width']))
body.items.append(r)
else:
# todo: deal with instances where there are no derivatives for whatever reason
pass

anno.body = body
ap.add_item(anno)
c.add_item(ap)
manifest.add_item(c)
elif mediatype == "collection":
raise IsCollection
else:
Expand Down Expand Up @@ -785,6 +840,73 @@ def create_annotations(version, identifier, fileName, canvas_no, domain=None):

return json.loads(annotationPage.jsonld())

def create_vtt_stream(identifier):
    """
    Build a WebVTT document for an item from its streamed SRT captions.

    The item's metadata is fetched from ``ARCHIVE`` and the SRT file is then
    requested in 60 second windows using URLs of the form:
    https://archive.org/download/CSPAN3_20180217_164800_Poplar_Forest_Archaeology/CSPAN3_20180217_164800_Poplar_Forest_Archaeology.cc5.srt?t=0/360
    where the ``t`` parameter is ``<start>/<end>`` in seconds. Every timing
    line found in a window is shifted by that window's start offset and
    rewritten in WebVTT notation.

    :param identifier: archive.org item identifier
    :return: the WebVTT document as a single string; only the ``WEBVTT``
             header when the item lists no ``.srt`` file or is shorter than
             one 60 second window
    """

    metadata = requests.get('%s/metadata/%s' % (ARCHIVE, identifier)).json()
    filename = ""
    duration = 0.0
    for file in metadata['files']:
        # The length is read from the original .mpg file.
        # NOTE(review): the manifest code takes the length from the .mp4
        # file instead - confirm both always agree.
        if file['name'].endswith('.mpg') and file['source'] == 'original':
            duration = float(file['length'])
        # There can be several srt files and it is unclear how they differ,
        # so the last one listed wins
        if file['name'].endswith('.srt'):
            filename = file['name']

    # Initialize the vtt content with the WEBVTT header
    vtt_content = ["WEBVTT\n"]

    if not filename:
        # Nothing to convert; do not request a URL with an empty file name
        return "\n".join(vtt_content)

    # NOTE(review): an item shorter than 60 seconds gives zero windows and
    # therefore a document that only contains the header.
    segments = math.floor(duration / 60)
    for i in range(segments):
        start = i * 60
        # The final window runs to the (truncated) end of the recording
        end = int(duration) if i == segments - 1 else (i + 1) * 60
        offset = timedelta(seconds=start)

        response = requests.get(f"https://archive.org/download/{identifier}/{filename}?t={start}/{end}")
        if response.status_code != 200:
            # Best effort: a window that cannot be fetched is left out
            continue

        for line in response.text.splitlines():
            if "-->" in line:
                # SRT timing line: "00:00:00,000 --> 00:00:02,000".
                # The start is the part before the arrow and the end the
                # part after it; both are moved by the window offset.
                splitline = line.split("-->")
                starttime = timeToDelta(splitline[0].strip()) + offset
                endtime = timeToDelta(splitline[1].strip()) + offset
                # WebVTT uses '.' before the milliseconds and the same
                # '-->' separator as SRT
                line = f"{formatTimeVTT(starttime)} --> {formatTimeVTT(endtime)}"

            vtt_content.append(line)

        # An empty line ends the last cue of this window
        vtt_content.append("")

    # Join the list into a single string
    return "\n".join(vtt_content)

def formatTimeVTT(time):
    """
    Render a timedelta as a WebVTT style timestamp: HH:MM:SS.mmm
    """
    whole_minutes, secs = divmod(time.total_seconds(), 60)
    hrs, mins = divmod(whole_minutes, 60)
    # Sub-second part comes from the microseconds field, truncated to ms
    millis = int(time.microseconds / 1000)
    return "{:02}:{:02}:{:02}.{:03}".format(int(hrs), int(mins), int(secs), millis)

def timeToDelta(time):
    """
    Turn an SRT timestamp (HH:MM:SS,mmm) into a timedelta
    """
    pieces = time.split(",")
    # Milliseconds are read first, then the clock fields left to right
    millis = int(pieces[1])
    clock = pieces[0].split(":")
    return timedelta(
        hours=int(clock[0]),
        minutes=int(clock[1]),
        seconds=int(clock[2]),
        milliseconds=millis,
    )

def coerce_list(value):
if isinstance(value, list):
return ". ".join(value)
Expand Down
29 changes: 29 additions & 0 deletions tests/test_video.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
os.environ["FLASK_CACHE_DISABLE"] = "true"

import unittest
import math
from flask.testing import FlaskClient
from iiify.app import app

Expand Down Expand Up @@ -66,6 +67,34 @@ def test_vtt_multilingual(self):
if item['body']['language'] == 'cy':
self.assertEqual(item['body']['id'], 'https://localhost/iiif/resource/cruz-test/cruz-test.cy.vtt', 'Unexpected link for the Welsh vtt file')

def test_newsitem(self):
    """
    A news item manifest has one painting annotation per one minute
    segment, and links a streamed vtt file whose cue times are offset by
    the start of the segment they belong to.
    """
    resp = self.test_app.get("/iiif/3/CSPAN3_20180217_164800_Poplar_Forest_Archaeology/manifest.json")
    self.assertEqual(resp.status_code, 200)
    manifest = resp.json

    canvas = manifest['items'][0]
    annoPages = canvas['items'][0]
    annotations = annoPages['items']
    self.assertEqual(len(annotations), math.floor(780.89 / 60), 'Expected the video to contain the 13min video split into 1 minute segments')

    # Check vtt file
    self.assertTrue('annotations' in canvas, "Expected canvas to have annotations")
    vttFile = canvas['annotations'][0]['items'][0]['body']['id']
    self.assertTrue(vttFile.endswith("/iiif/vtt/streaming/CSPAN3_20180217_164800_Poplar_Forest_Archaeology.vtt"),f"Expected vttFile to be located at /iiif/vtt/streaming/CSPAN3_20180217_164800_Poplar_Forest_Archaeology.vtt but found it at {vttFile}")

    resp = self.test_app.get("/iiif/vtt/streaming/CSPAN3_20180217_164800_Poplar_Forest_Archaeology.vtt")
    self.assertEqual(resp.status_code, 200, "Expected the streamed vtt file to be served")

    # The part of the file being checked is cue 28:
    # 28
    # 00:01:02.000 ...
    # I AM THE DIRECTOR OF ARCHAEOLOGY
    # Only the cue start time is compared: it is the value that shows the
    # offset of the second one minute segment was applied.
    checkLine = False
    timingChecked = False
    for line in resp.text.split("\n"):
        if checkLine:
            self.assertEqual("00:01:02.000", line.split(" ")[0], "Expected the timecode to be over a minute as its the second video")
            timingChecked = True
            break
        if line.startswith("28"):
            checkLine = True

    # Without this the test would pass silently when cue 28 is missing
    self.assertTrue(timingChecked, "Expected to find cue 28 followed by its timing line in the vtt file")



if __name__ == '__main__':
unittest.main()

0 comments on commit 969de0f

Please sign in to comment.