support for cutoff_date filtering
Implemented the '--cutoff-date' CLI option and JSON field, allowing users to
specify a cutoff date. Only sources published after this date will be processed.
This feature makes the application more flexible by letting users exclude
older content from processing and target newer updates within a desired
timeframe. The cutoff date is expected in YYYY-MM-DD format, streamlining
content selection based on publication date.
kouloumos committed Feb 28, 2024
1 parent 15c1106 commit f87e0c3
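
To illustrate the intended call pattern (a hypothetical sketch: the playlist URL is a placeholder, and the keyword arguments follow the add_transcription_source signature introduced below; the comparison is strict, so sources dated exactly on the cutoff are excluded):

    # Hypothetical usage sketch: restrict a playlist to videos published
    # strictly after 2024-01-01; "cutoff_date" is the new argument.
    transcription.add_transcription_source(
        source_file="https://www.youtube.com/playlist?list=PLACEHOLDER",
        loc="misc",
        cutoff_date="2024-01-01",  # expected in YYYY-MM-DD format
    )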
Showing 4 changed files with 51 additions and 14 deletions.
2 changes: 1 addition & 1 deletion app/transcript.py
@@ -397,7 +397,7 @@ def __config_source(self):
         for entry in rss.entries:
             enclosure = next(
                 (link for link in entry.links if link.get('rel') == 'enclosure'), None)
-            if enclosure.type in ['audio/mpeg', 'audio/wav', 'audio/x-m4a']:
+            if enclosure.type in ['audio/mpeg', 'audio/wav', 'audio/x-m4a', 'audio/mp4']:
                 published_date = date(*entry.published_parsed[:3])
                 source = Audio(Source(enclosure.href, self.loc, self.local, entry.title, published_date, self.tags,
                                       self.category, self.speakers, self.preprocess, link=entry.link), description=entry.description)
28 changes: 19 additions & 9 deletions app/transcription.py
@@ -201,6 +201,8 @@ def add_transcription_source(
         loc="misc",
         title=None,
         date=None,
+        # cutoff_date serves as a threshold, and only content published beyond this point is relevant
+        cutoff_date=None,
         tags=[],
         category=[],
         speakers=[],
@@ -212,6 +214,11 @@
         excluded_media=[]
     ):
         """Add a source for transcription"""
+        if cutoff_date:
+            cutoff_date = utils.validate_and_parse_date(cutoff_date)
+            # Even with a cutoff date, for YouTube playlists we still need to download the metadata
+            # for each video in order to obtain the `upload_date` and use it for filtering
+            self.logger.info(f"A cutoff date of '{cutoff_date}' is given. Processing sources published after this date.")
         preprocess = False if self.test_mode else preprocess
         transcription_sources = {"added": [], "exist": []}
         # check if source is a local file
@@ -235,27 +242,29 @@
             if source.type == "playlist":
                 # add a transcript for each source/video in the playlist
                 for video in source.videos:
-                    if video.media not in excluded_media:
-                        transcription_sources['added'].append(video)
+                    is_eligible = video.date > cutoff_date if cutoff_date else True
+                    if video.media not in excluded_media and is_eligible:
+                        transcription_sources['added'].append(video.source_file)
                         self._new_transcript_from_source(video)
                     else:
-                        transcription_sources['exist'].append(video)
+                        transcription_sources['exist'].append(video.source_file)
             elif source.type == 'rss':
                 # add a transcript for each source/audio in the rss feed
                 for entry in source.entries:
-                    if entry.media not in excluded_media:
-                        transcription_sources['added'].append(entry)
+                    is_eligible = entry.date > cutoff_date if cutoff_date else True
+                    if entry.media not in excluded_media and is_eligible:
+                        transcription_sources['added'].append(entry.source_file)
                         self._new_transcript_from_source(entry)
                     else:
-                        transcription_sources['exist'].append(entry)
+                        transcription_sources['exist'].append(entry.source_file)
             elif source.type in ['audio', 'video']:
                 if source.media not in excluded_media:
-                    transcription_sources['added'].append(source)
+                    transcription_sources['added'].append(source.source_file)
                     self._new_transcript_from_source(source)
                     self.logger.info(
                         f"Source added for transcription: {source.title}")
                 else:
-                    transcription_sources['exist'].append(source)
+                    transcription_sources['exist'].append(source.source_file)
                     self.logger.info(f"Source already exists: {source.title}")
             else:
                 raise Exception(f"Invalid source: {source_file}")
@@ -290,7 +299,8 @@ def add_transcription_source_JSON(self, json_file, nocheck=False):
                 chapters=metadata["chapters"],
                 link=metadata["media"],
                 excluded_media=metadata["excluded_media"],
-                nocheck=nocheck
+                nocheck=nocheck,
+                cutoff_date=metadata["cutoff_date"]
             )

     def start(self, test_transcript=None):
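Note: the hunk above relies on utils.validate_and_parse_date, whose implementation is not part of this diff. A minimal sketch of such a helper, assuming it turns the YYYY-MM-DD string into a datetime.date comparable with video.date and entry.date:

    from datetime import datetime

    def validate_and_parse_date(date_str):
        # Sketch only: parse a YYYY-MM-DD string into a datetime.date,
        # failing loudly on any other format.
        try:
            return datetime.strptime(date_str, "%Y-%m-%d").date()
        except ValueError:
            raise ValueError(
                f"Invalid cutoff date '{date_str}': expected YYYY-MM-DD")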
1 change: 1 addition & 0 deletions app/utils.py
@@ -85,6 +85,7 @@ def configure_metadata_given_from_JSON(source, from_json=None):
     metadata["chapters"] = source.get("chapters", [])
     metadata["loc"] = source.get("loc", "")
     metadata["date"] = source.get("date", None)
+    metadata["cutoff_date"] = source.get("cutoff_date", None)
     metadata["youtube_metadata"] = source.get("youtube", None)
     metadata["media"] = source.get("media", None)
     excluded_media = source.get(
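Because cutoff_date is read with a None default, JSON sources only need the key when filtering is wanted. A hypothetical source entry, shown as the Python dict json.load would produce (keys other than cutoff_date mirror the source.get calls above; the media URL is a placeholder):

    source = {
        "loc": "misc",
        "date": None,
        "media": "https://example.com/episode.mp3",
        "cutoff_date": "2024-01-01",  # new in this commit; omit to disable filtering
    }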
34 changes: 30 additions & 4 deletions transcriber.py
@@ -89,6 +89,15 @@ def print_help(ctx, param, value):
     default=False,
     help="Summarize the transcript [only available with deepgram]",
 )
+cutoff_date = click.option(
+    "--cutoff-date",
+    type=str,
+    help=("Specify a cutoff date (in YYYY-MM-DD format) to process only sources "
+          "published after this date. Sources with a publication date on or before "
+          "the cutoff will be excluded from processing. This option is useful for "
+          "focusing on newer content or limiting the scope of processing to a "
+          "specific date range.")
+)
 github = click.option(
     "--github",
     type=click.Choice(["remote", "local", "none"]),
@@ -198,6 +207,8 @@ def print_help(ctx, param, value):
 @add_speakers
 @add_category
 @add_loc
+# Options for configuring the transcription preprocess
+@cutoff_date
 # Options for configuring the transcription postprocess
 @github
 @upload_to_s3
@@ -228,6 +239,7 @@ def transcribe(
     noqueue: bool,
     markdown: bool,
     needs_review: bool,
+    cutoff_date: str,
 ) -> None:
     """Transcribe the provided sources. Suported sources include: \n
     - YouTube videos and playlists\n
@@ -265,7 +277,14 @@
             transcription.add_transcription_source_JSON(source)
         else:
             transcription.add_transcription_source(
-                source_file=source, loc=loc, title=title, date=date, tags=list(tags), category=list(category), speakers=list(speakers),
+                source_file=source,
+                loc=loc,
+                title=title,
+                date=date,
+                tags=list(tags),
+                category=list(category),
+                speakers=list(speakers),
+                cutoff_date=cutoff_date
             )
         transcription.start()
         if nocleanup:
@@ -275,10 +294,13 @@
     except Exception as e:
         logger.error(e)
         logger.info(f"Exited with error, not cleaning up temp files: {tmp_dir}")
+        traceback.print_exc()


 @cli.command()
 @click.argument("source", nargs=1)
+# Options for configuring the transcription preprocess
+@cutoff_date
 @click.option(
     "--nocheck",
     is_flag=True,
@@ -307,7 +329,8 @@ def preprocess(
     speakers: list,
     category: list,
     nocheck: bool,
-    no_batched_output: bool
+    no_batched_output: bool,
+    cutoff_date: str
 ):
     """Preprocess the provided sources. Suported sources include: \n
     - YouTube videos and playlists\n
@@ -335,7 +358,8 @@
         category=category,
         speakers=speakers,
         preprocess=True,
-        nocheck=nocheck
+        nocheck=nocheck,
+        cutoff_date=cutoff_date
     )
     if not no_batched_output:
         # Batch write all preprocessed sources to JSON
@@ -391,7 +415,8 @@ def postprocess(
         f"Postprocessing {service} transcript from {metadata_json_file}")
     with open(metadata_json_file, "r") as outfile:
         metadata_json = json.load(outfile)
-    metadata = utils.configure_metadata_given_from_JSON(metadata_json, from_json=metadata_json_file)
+    metadata = utils.configure_metadata_given_from_JSON(
+        metadata_json, from_json=metadata_json_file)
     transcription.add_transcription_source(
         source_file=metadata["source_file"],
         loc=metadata["loc"],
@@ -404,6 +429,7 @@
         chapters=metadata["chapters"],
         link=metadata["media"],
         preprocess=False,
+        cutoff_date=metadata["cutoff_date"]
     )
     # Finalize transcription service output
     transcript_to_postprocess = transcription.transcripts[0]
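Putting it together, a preprocess run that skips everything published on or before 2024-01-01 might look like this (invocation sketch, assuming the script is run directly; the playlist URL is a placeholder):

    python transcriber.py preprocess "https://www.youtube.com/playlist?list=PLACEHOLDER" --cutoff-date 2024-01-01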
