Adding support for news items

internetarchive · Sep 19, 2024 · 969de0f · 969de0f
1 parent 0d9a212
commit 969de0f
Show file tree

Hide file tree

Showing 3 changed files with 225 additions and 67 deletions.
diff --git a/iiify/app.py b/iiify/app.py
@@ -3,12 +3,12 @@
 import os
 import time
 import requests
-from flask import Flask, send_file, jsonify, abort, request, render_template, redirect
+from flask import Flask, send_file, jsonify, abort, request, render_template, redirect, make_response
 from flask_cors import CORS
 from flask_caching import Cache
 from iiif2 import iiif, web
 from .resolver import ia_resolver, create_manifest, create_manifest3, getids, collection, \
- purify_domain, cantaloupe_resolver, create_collection3, IsCollection, create_annotations
+ purify_domain, cantaloupe_resolver, create_collection3, IsCollection, create_annotations, create_vtt_stream
 from .configs import options, cors, approot, cache_root, media_root, \
  cache_expr, version, image_server, cache_timeouts
 from urllib.parse import quote
@@ -197,6 +197,13 @@ def annnotations(version, identifier, fileName, canvas_no):
  domain = purify_domain(request.args.get('domain', request.url_root))
  return ldjsonify(create_annotations(version, identifier, fileName, canvas_no, domain=domain))
 
@app.route('/iiif/vtt/streaming/<identifier>.vtt')
@cache.cached(timeout=cache_timeouts["long"], forced_update=cache_bust)
def vtt_stream(identifier):
    """Serve the stitched WebVTT captions for the item ``identifier``.

    The caption text comes from ``create_vtt_stream``; this view only wraps
    it in a response and labels it as ``text/vtt``.
    """
    vtt_body = create_vtt_stream(identifier)
    vtt_response = make_response(vtt_body)
    # Override the Content-Type that make_response picked for a plain string.
    vtt_response.headers['Content-Type'] = 'text/vtt'
    return vtt_response
+
 @app.route('/iiif/<identifier>/manifest.json')
 @cache.cached(timeout=cache_timeouts["long"], forced_update=cache_bust)
 def manifest(identifier):

diff --git a/iiify/resolver.py b/iiify/resolver.py
@@ -12,6 +12,7 @@
 import math 
 import re
 import xml.etree.ElementTree as ET
+from datetime import timedelta
 
 IMG_CTX = 'http://iiif.io/api/image/2/context.json'
 PRZ_CTX = 'http://iiif.io/api/presentation/2/context.json'
@@ -658,77 +659,131 @@ def create_manifest3(identifier, domain=None, page=None):
  vttfiles[sourceFilename] = [] 
 
  vttfiles[sourceFilename].append(f) 
-
- # create the canvases for each original
- for file in [f for f in originals if f['format'] in ['MPEG4', 'h.264 MPEG4', '512Kb MPEG4', 'HiRes MPEG4', 'MPEG2', 'h.264', 'Matroska', 'Ogg Video', 'Ogg Theora', 'WebM', 'Windows Media', 'Cinepack']]:
- normalised_id = file['name'].rsplit(".", 1)[0]
+
+ if 'access-restricted-item' in metadata['metadata'] and metadata['metadata']['access-restricted-item']:
+ # this is a news item so has to be treated differently
+ # https://ia801803.us.archive.org/29/items/CSPAN3_20180217_164800_Poplar_Forest_Archaeology/CSPAN3_20180217_164800_Poplar_Forest_Archaeology.mp4?start=0&end=360&ignore=x.mp4&cnt=0
+ mp4File = None
+ duration = 0.0
+ filedata = None
+ for file in metadata['files']:
+ if file['name'].endswith('.mp4'):
+ mp4File = file['name']
+ duration = float(file['length'])
+ filedata = file
+
+ normalised_id = mp4File.rsplit(".", 1)[0]
  slugged_id = normalised_id.replace(" ", "-")
  c_id = f"{URI_PRIFIX}/{identifier}/{slugged_id}/canvas"
- c = Canvas(id=c_id, label=normalised_id, duration=float(file['length']), height=int(file['height']), width=int(file['width']))
-
- # Add vtt if present
- if vttfiles and normalised_id in vttfiles:
- vttAPId = f"{URI_PRIFIX}/{identifier}/{slugged_id}/vtt"
-
- vttNo = 1
- for vttFile in vttfiles[normalised_id]:
- vtAnno = c.make_annotation(id=f"{URI_PRIFIX}/{identifier}/{slugged_id}/annotation/vtt/{vttNo}", 
- motivation="supplementing", 
- target=c.id, 
- anno_page_id=vttAPId,
- body={"id": f"{domain}resource/{identifier}/{vttFile['name']}",
- "type": "Text",
- "format": "text/vtt",
- })
- # add label and language
- if vttFile['name'].endswith("autogenerated.vtt"):
- vtAnno.body.label = { 'en': ['autogenerated']}
- else:
- # Assume language
- splitName = vttFile['name'].split(".")
- lang = splitName[-2]
- vtAnno.body.add_label(lang, language="none")
- vtAnno.body.language = lang
-
- vttNo += 1
-
- # create intermediary objects
+ c = Canvas(id=c_id, label=normalised_id, duration=duration, height=int(filedata['height']), width=int(filedata['width'])) 
  ap = AnnotationPage(id=f"{URI_PRIFIX}/{identifier}/{slugged_id}/page")
- anno = Annotation(id=f"{URI_PRIFIX}/{identifier}/{slugged_id}/annotation", motivation="painting", target=c.id)
 
- # create body based on whether there are derivatives or not:
- if file['name'] in derivatives:
- body = Choice(items=[])
- # add the choices in order per https://github.com/ArchiveLabs/iiif.archivelab.org/issues/77#issuecomment-1499672734
- for format in ['MPEG4', 'h.264 MPEG4', '512Kb MPEG4', 'HiRes MPEG4', 'MPEG2', 'h.264', 'Matroska', 'Ogg Video', 'Ogg Theora', 'WebM', 'Windows Media', 'Cinepack']:
- if format in derivatives[file['name']]:
- r = ResourceItem(id=f"https://archive.org/download/{identifier}/{derivatives[file['name']][format]['name'].replace(' ', '%20')}",
- type='Video',
- format=to_mimetype(format),
- label={"none": [format]},
- duration=float(file['length']), 
- height=int(file['height']),
- width=int(file['width']), 
- )
- body.items.append(r)
- elif file['format'] == format:
- r = ResourceItem(
- id=f"https://archive.org/download/{identifier}/{file['name'].replace(' ', '%20')}",
- type='Video',
- format=to_mimetype(format),
- label={"none": [format]},
- duration=float(file['length']),
- height=int(file['height']),
- width=int(file['width']))
- body.items.append(r)
- else:
- # todo: deal with instances where there are no derivatives for whatever reason
- pass
-
- anno.body = body
- ap.add_item(anno)
+ vttAPId = f"{URI_PRIFIX}/{identifier}/{slugged_id}/vtt"
+ vtAnno = c.make_annotation(id=f"{URI_PRIFIX}/{identifier}/{slugged_id}/annotation/vtt/streamed", 
+ motivation="supplementing", 
+ target=c.id, 
+ anno_page_id=vttAPId,
+ body={"id": f"{domain}vtt/streaming/{identifier}.vtt",
+ "type": "Text",
+ "format": "text/vtt",
+ })
+
+ segments = math.floor(duration / 60)
+ for i in range(segments):
+ start = i * 60
+ if i == segments - 1:
+ end = int(duration)
+ else:
+ end = (i + 1) * 60
+
+ #print (f"Start: {start} End: {end}, Duration: {float(end) - float(start)} full duration: {duration}")
+ anno = Annotation(id=f"{URI_PRIFIX}/{identifier}/{slugged_id}/annotation/{i}", motivation="painting", target=f"{c.id}#t={start},{end}")
+ streamurl = f"https://{metadata['server']}{metadata['dir']}/{mp4File}?start={start}&end={end}&ignore=x.mp4&cnt=0" 
+ body = ResourceItem(id=streamurl,
+ type='Video',
+ format="video/mp4",
+ label={"en": [f"Part {i + 1} of {segments}"]},
+ duration=end - start, 
+ height=int(filedata['height']),
+ width=int(filedata['width']), 
+ )
+
+ anno.body = body
+ ap.add_item(anno)
+
  c.add_item(ap)
  manifest.add_item(c)
+ else:
+ # create the canvases for each original
+ for file in [f for f in originals if f['format'] in ['MPEG4', 'h.264 MPEG4', '512Kb MPEG4', 'HiRes MPEG4', 'MPEG2', 'h.264', 'Matroska', 'Ogg Video', 'Ogg Theora', 'WebM', 'Windows Media', 'Cinepack']]:
+ normalised_id = file['name'].rsplit(".", 1)[0]
+ slugged_id = normalised_id.replace(" ", "-")
+ c_id = f"{URI_PRIFIX}/{identifier}/{slugged_id}/canvas"
+ c = Canvas(id=c_id, label=normalised_id, duration=float(file['length']), height=int(file['height']), width=int(file['width']))
+
+ # Add vtt if present
+ if vttfiles and normalised_id in vttfiles:
+ vttAPId = f"{URI_PRIFIX}/{identifier}/{slugged_id}/vtt"
+
+ vttNo = 1
+ for vttFile in vttfiles[normalised_id]:
+ vtAnno = c.make_annotation(id=f"{URI_PRIFIX}/{identifier}/{slugged_id}/annotation/vtt/{vttNo}", 
+ motivation="supplementing", 
+ target=c.id, 
+ anno_page_id=vttAPId,
+ body={"id": f"{domain}resource/{identifier}/{vttFile['name']}",
+ "type": "Text",
+ "format": "text/vtt",
+ })
+ # add label and language
+ if vttFile['name'].endswith("autogenerated.vtt"):
+ vtAnno.body.label = { 'en': ['autogenerated']}
+ else:
+ # Assume language
+ splitName = vttFile['name'].split(".")
+ lang = splitName[-2]
+ vtAnno.body.add_label(lang, language="none")
+ vtAnno.body.language = lang
+
+ vttNo += 1
+
+ # create intermediary objects
+ ap = AnnotationPage(id=f"{URI_PRIFIX}/{identifier}/{slugged_id}/page")
+ anno = Annotation(id=f"{URI_PRIFIX}/{identifier}/{slugged_id}/annotation", motivation="painting", target=c.id)
+
+ # create body based on whether there are derivatives or not:
+ if file['name'] in derivatives:
+ body = Choice(items=[])
+ # add the choices in order per https://github.com/ArchiveLabs/iiif.archivelab.org/issues/77#issuecomment-1499672734
+ for format in ['MPEG4', 'h.264 MPEG4', '512Kb MPEG4', 'HiRes MPEG4', 'MPEG2', 'h.264', 'Matroska', 'Ogg Video', 'Ogg Theora', 'WebM', 'Windows Media', 'Cinepack']:
+ if format in derivatives[file['name']]:
+ r = ResourceItem(id=f"https://archive.org/download/{identifier}/{derivatives[file['name']][format]['name'].replace(' ', '%20')}",
+ type='Video',
+ format=to_mimetype(format),
+ label={"none": [format]},
+ duration=float(file['length']), 
+ height=int(file['height']),
+ width=int(file['width']), 
+ )
+ body.items.append(r)
+ elif file['format'] == format:
+ r = ResourceItem(
+ id=f"https://archive.org/download/{identifier}/{file['name'].replace(' ', '%20')}",
+ type='Video',
+ format=to_mimetype(format),
+ label={"none": [format]},
+ duration=float(file['length']),
+ height=int(file['height']),
+ width=int(file['width']))
+ body.items.append(r)
+ else:
+ # todo: deal with instances where there are no derivatives for whatever reason
+ pass
+
+ anno.body = body
+ ap.add_item(anno)
+ c.add_item(ap)
+ manifest.add_item(c)
  elif mediatype == "collection":
  raise IsCollection
  else:
@@ -785,6 +840,73 @@ def create_annotations(version, identifier, fileName, canvas_no, domain=None):
 
  return json.loads(annotationPage.jsonld())
 
def create_vtt_stream(identifier):
    """
    Build a single WebVTT document from the streamed SRT captions of an item.

    The SRT file is read in one-minute windows using URLs of the form:
    https://archive.org/download/CSPAN3_20180217_164800_Poplar_Forest_Archaeology/CSPAN3_20180217_164800_Poplar_Forest_Archaeology.cc5.srt?t=0/360
    where the ``t`` parameter takes seconds. Every cue timing line in a
    window is shifted by that window's start offset and rewritten in WebVTT
    syntax; all other lines are copied through unchanged.

    :param identifier: archive.org item identifier
    :return: the WebVTT document as one string; only the ``WEBVTT`` header
             when the item lists no ``.srt`` file or no original ``.mpg``
             with a length of at least one minute
    """
    metadata = requests.get('%s/metadata/%s' % (ARCHIVE, identifier)).json()
    filename = ""
    duration = 0.0
    for file in metadata['files']:
        if file['name'].endswith('.mpg') and file.get('source') == 'original':
            duration = float(file['length'])
        # There seem to be multiple srt files and it is unclear how they
        # differ, so the last one listed is the one that gets used.
        if file['name'].endswith('.srt'):
            filename = file['name']

    # Initialize the vtt content with the WEBVTT header
    vtt_content = ["WEBVTT\n"]

    if not filename:
        # Without an SRT file the download URL below would point at the item
        # directory itself, and whatever it returns would be copied into the
        # captions.
        return "\n".join(vtt_content)

    segments = math.floor(duration / 60)
    for i in range(segments):
        start = i * 60
        # The last window is stretched to the end of the video so that the
        # remainder (less than a minute) is not dropped.
        end = int(duration) if i == segments - 1 else (i + 1) * 60
        offset = timedelta(seconds=start)

        response = requests.get(f"https://archive.org/download/{identifier}/{filename}?t={start}/{end}")
        if response.status_code != 200:
            # Best effort: a window that cannot be fetched leaves a gap in
            # the captions instead of failing the whole document.
            continue

        for line in response.text.splitlines():
            if "-->" in line:
                # Convert time format: 00:00:00,000 -> 00:00:00.000 and move
                # the cue onto the timeline of the whole video.
                start_raw, _, end_raw = line.partition("-->")
                starttime = timeToDelta(start_raw.strip()) + offset
                endtime = timeToDelta(end_raw.strip()) + offset
                line = f"{formatTimeVTT(starttime)} --> {formatTimeVTT(endtime)}"

            vtt_content.append(line)

        # A truly empty line ends the last cue block of this window; a line
        # holding a space would be read as part of the cue.
        vtt_content.append("")

    # Join the list into a single string
    return "\n".join(vtt_content)
+
def formatTimeVTT(time):
    """
    Format a timedelta as a WebVTT style timestamp: HH:MM:SS.mmm
    """
    whole_hours, leftover = divmod(time.total_seconds(), 3600)
    whole_minutes, whole_seconds = divmod(leftover, 60)
    # The fraction comes from the timedelta's own microseconds field.
    millis = int(time.microseconds / 1000)
    return "{:02}:{:02}:{:02}.{:03}".format(
        int(whole_hours), int(whole_minutes), int(whole_seconds), millis
    )
+
def timeToDelta(time):
    """
    Convert an SRT formatted timestamp to a timedelta.

    The expected form is ``HH:MM:SS,mmm``. A ``.`` is accepted in place of
    the comma and the millisecond part may be left out, in which case it
    counts as zero. Surrounding whitespace is ignored.

    :param time: timestamp string, e.g. ``"00:01:02,500"``
    :return: the timestamp as a ``timedelta``
    :raises ValueError: if a field is not an integer or the clock part does
                        not have exactly three ``:`` separated fields
    """
    # Normalise the fraction separator first so one split handles both forms.
    clock, _, fraction = time.strip().replace(".", ",").partition(",")
    hour, minute, second = (int(field) for field in clock.split(":"))
    milliseconds = int(fraction) if fraction else 0
    return timedelta(hours=hour, minutes=minute, seconds=second, milliseconds=milliseconds)
+
 def coerce_list(value):
  if isinstance(value, list):
  return ". ".join(value)

diff --git a/tests/test_video.py b/tests/test_video.py
@@ -2,6 +2,7 @@
 os.environ["FLASK_CACHE_DISABLE"] = "true"
 
 import unittest
+import math
 from flask.testing import FlaskClient
 from iiify.app import app
 
@@ -66,6 +67,34 @@ def test_vtt_multilingual(self):
  if item['body']['language'] == 'cy':
  self.assertEqual(item['body']['id'], 'https://localhost/iiif/resource/cruz-test/cruz-test.cy.vtt', 'Unexpected link for the Welsh vtt file')
 
+ def test_newsitem(self):
+ resp = self.test_app.get("/iiif/3/CSPAN3_20180217_164800_Poplar_Forest_Archaeology/manifest.json")
+ self.assertEqual(resp.status_code, 200)
+ manifest = resp.json
+
+ canvas = manifest['items'][0]
+ annoPages = canvas['items'][0]
+ annotations = annoPages['items']
+ self.assertEqual(len(annotations), math.floor(780.89 / 60), 'Expected the video to contain the 13min video split into 1 minute segments')
+
+ # Check vtt file
+ self.assertTrue('annotations' in canvas, "Expected canvas to have annotations")
+ vttFile = canvas['annotations'][0]['items'][0]['body']['id']
+ self.assertTrue(vttFile.endswith("/iiif/vtt/streaming/CSPAN3_20180217_164800_Poplar_Forest_Archaeology.vtt"),f"Expected vttFile to be located at /iiif/vtt/streaming/CSPAN3_20180217_164800_Poplar_Forest_Archaeology.vtt but found it at {vttFile}")
+
+ resp = self.test_app.get("/iiif/vtt/streaming/CSPAN3_20180217_164800_Poplar_Forest_Archaeology.vtt")
+ checkLine=False
+ for line in resp.text.split("\n"):
+ if checkLine:
+ self.assertEqual("00:01:02.000 -> 00:01:02.000", line, "Expected the timecode to be over a minute as its the second video")
+ break 
+ if line.startswith("28"):
+ checkLine=True
+ # 28
+ # 00:01:02.000 -> 00:01:02.000
+ # I AM THE DIRECTOR OF ARCHAEOLOGY
+
+
 
 if __name__ == '__main__':
  unittest.main()