support for cutoff_date filtering
Implemented the '--cutoff-date' CLI option and JSON field, allowing users to
specify a cutoff date. Only sources published after this date will be processed.
This feature makes the application more flexible by letting users exclude
older content from processing and target newer updates within a desired
timeframe. The cutoff date is expected in YYYY-MM-DD format, streamlining
content selection based on publication date.
kouloumos committed Feb 28, 2024
1 parent 15c1106 commit f87e0c3
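
To illustrate the intended call pattern (a hypothetical sketch: the playlist URL is a placeholder, and the keyword arguments follow the add_transcription_source signature introduced below; the comparison is strict, so sources dated exactly on the cutoff are excluded):

    # Hypothetical usage sketch: restrict a playlist to videos published
    # strictly after 2024-01-01; "cutoff_date" is the new argument.
    transcription.add_transcription_source(
        source_file="https://www.youtube.com/playlist?list=PLACEHOLDER",
        loc="misc",
        cutoff_date="2024-01-01",  # expected in YYYY-MM-DD format
    )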
Showing 4 changed files with 51 additions and 14 deletions.
2 changes: 1 addition & 1 deletion app/transcript.py
@@ -397,7 +397,7 @@ def __config_source(self):
         for entry in rss.entries:
             enclosure = next(
                 (link for link in entry.links if link.get('rel') == 'enclosure'), None)
-            if enclosure.type in ['audio/mpeg', 'audio/wav', 'audio/x-m4a']:
+            if enclosure.type in ['audio/mpeg', 'audio/wav', 'audio/x-m4a', 'audio/mp4']:
                 published_date = date(*entry.published_parsed[:3])
                 source = Audio(Source(enclosure.href, self.loc, self.local, entry.title, published_date, self.tags,
                                       self.category, self.speakers, self.preprocess, link=entry.link), description=entry.description)
28 changes: 19 additions & 9 deletions app/transcription.py
@@ -201,6 +201,8 @@ def add_transcription_source(
         loc="misc",
         title=None,
         date=None,
+        # cutoff_date serves as a threshold, and only content published beyond this point is relevant
+        cutoff_date=None,
         tags=[],
         category=[],
         speakers=[],
@@ -212,6 +214,11 @@
         excluded_media=[]
     ):
         """Add a source for transcription"""
+        if cutoff_date:
+            cutoff_date = utils.validate_and_parse_date(cutoff_date)
+            # Even with a cutoff date, for YouTube playlists we still need to download the metadata
+            # for each video in order to obtain the `upload_date` and use it for filtering
+            self.logger.info(f"A cutoff date of '{cutoff_date}' is given. Processing sources published after this date.")
         preprocess = False if self.test_mode else preprocess
         transcription_sources = {"added": [], "exist": []}
         # check if source is a local file
@@ -235,27 +242,29 @@
             if source.type == "playlist":
                 # add a transcript for each source/video in the playlist
                 for video in source.videos:
-                    if video.media not in excluded_media:
-                        transcription_sources['added'].append(video)
+                    is_eligible = video.date > cutoff_date if cutoff_date else True
+                    if video.media not in excluded_media and is_eligible:
+                        transcription_sources['added'].append(video.source_file)
                         self._new_transcript_from_source(video)
                     else:
-                        transcription_sources['exist'].append(video)
+                        transcription_sources['exist'].append(video.source_file)
             elif source.type == 'rss':
                 # add a transcript for each source/audio in the rss feed
                 for entry in source.entries:
-                    if entry.media not in excluded_media:
-                        transcription_sources['added'].append(entry)
+                    is_eligible = entry.date > cutoff_date if cutoff_date else True
+                    if entry.media not in excluded_media and is_eligible:
+                        transcription_sources['added'].append(entry.source_file)
                         self._new_transcript_from_source(entry)
                     else:
-                        transcription_sources['exist'].append(entry)
+                        transcription_sources['exist'].append(entry.source_file)
             elif source.type in ['audio', 'video']:
                 if source.media not in excluded_media:
-                    transcription_sources['added'].append(source)
+                    transcription_sources['added'].append(source.source_file)
                     self._new_transcript_from_source(source)
                     self.logger.info(
                         f"Source added for transcription: {source.title}")
                 else:
-                    transcription_sources['exist'].append(source)
+                    transcription_sources['exist'].append(source.source_file)
                     self.logger.info(f"Source already exists: {source.title}")
             else:
                 raise Exception(f"Invalid source: {source_file}")
@@ -290,7 +299,8 @@ def add_transcription_source_JSON(self, json_file, nocheck=False):
                 chapters=metadata["chapters"],
                 link=metadata["media"],
                 excluded_media=metadata["excluded_media"],
-                nocheck=nocheck
+                nocheck=nocheck,
+                cutoff_date=metadata["cutoff_date"]
             )

     def start(self, test_transcript=None):
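Note: the hunk above relies on utils.validate_and_parse_date, whose implementation is not part of this diff. A minimal sketch of such a helper, assuming it turns the YYYY-MM-DD string into a datetime.date comparable with video.date and entry.date:

    from datetime import datetime

    def validate_and_parse_date(date_str):
        # Sketch only: parse a YYYY-MM-DD string into a datetime.date,
        # failing loudly on any other format.
        try:
            return datetime.strptime(date_str, "%Y-%m-%d").date()
        except ValueError:
            raise ValueError(
                f"Invalid cutoff date '{date_str}': expected YYYY-MM-DD")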
1 change: 1 addition & 0 deletions app/utils.py
@@ -85,6 +85,7 @@ def configure_metadata_given_from_JSON(source, from_json=None):
     metadata["chapters"] = source.get("chapters", [])
     metadata["loc"] = source.get("loc", "")
     metadata["date"] = source.get("date", None)
+    metadata["cutoff_date"] = source.get("cutoff_date", None)
     metadata["youtube_metadata"] = source.get("youtube", None)
     metadata["media"] = source.get("media", None)
     excluded_media = source.get(
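Because cutoff_date is read with a None default, JSON sources only need the key when filtering is wanted. A hypothetical source entry, shown as the Python dict json.load would produce (keys other than cutoff_date mirror the source.get calls above; the media URL is a placeholder):

    source = {
        "loc": "misc",
        "date": None,
        "media": "https://example.com/episode.mp3",
        "cutoff_date": "2024-01-01",  # new in this commit; omit to disable filtering
    }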
34 changes: 30 additions & 4 deletions transcriber.py
@@ -89,6 +89,15 @@ def print_help(ctx, param, value):
     default=False,
     help="Summarize the transcript [only available with deepgram]",
 )
+cutoff_date = click.option(
+    "--cutoff-date",
+    type=str,
+    help=("Specify a cutoff date (in YYYY-MM-DD format) to process only sources "
+          "published after this date. Sources with a publication date on or before "
+          "the cutoff will be excluded from processing. This option is useful for "
+          "focusing on newer content or limiting the scope of processing to a "
+          "specific date range.")
+)
 github = click.option(
     "--github",
     type=click.Choice(["remote", "local", "none"]),
@@ -198,6 +207,8 @@ def print_help(ctx, param, value):
 @add_speakers
 @add_category
 @add_loc
+# Options for configuring the transcription preprocess
+@cutoff_date
 # Options for configuring the transcription postprocess
 @github
 @upload_to_s3
@@ -228,6 +239,7 @@ def transcribe(
     noqueue: bool,
     markdown: bool,
     needs_review: bool,
+    cutoff_date: str,
 ) -> None:
     """Transcribe the provided sources. Suported sources include: \n
     - YouTube videos and playlists\n
@@ -265,7 +277,14 @@
             transcription.add_transcription_source_JSON(source)
         else:
             transcription.add_transcription_source(
-                source_file=source, loc=loc, title=title, date=date, tags=list(tags), category=list(category), speakers=list(speakers),
+                source_file=source,
+                loc=loc,
+                title=title,
+                date=date,
+                tags=list(tags),
+                category=list(category),
+                speakers=list(speakers),
+                cutoff_date=cutoff_date
             )
         transcription.start()
         if nocleanup:
@@ -275,10 +294,13 @@
     except Exception as e:
         logger.error(e)
         logger.info(f"Exited with error, not cleaning up temp files: {tmp_dir}")
+        traceback.print_exc()


 @cli.command()
 @click.argument("source", nargs=1)
+# Options for configuring the transcription preprocess
+@cutoff_date
 @click.option(
     "--nocheck",
     is_flag=True,
@@ -307,7 +329,8 @@ def preprocess(
     speakers: list,
     category: list,
     nocheck: bool,
-    no_batched_output: bool
+    no_batched_output: bool,
+    cutoff_date: str
 ):
     """Preprocess the provided sources. Suported sources include: \n
     - YouTube videos and playlists\n
@@ -335,7 +358,8 @@
         category=category,
         speakers=speakers,
         preprocess=True,
-        nocheck=nocheck
+        nocheck=nocheck,
+        cutoff_date=cutoff_date
     )
     if not no_batched_output:
         # Batch write all preprocessed sources to JSON
@@ -391,7 +415,8 @@ def postprocess(
         f"Postprocessing {service} transcript from {metadata_json_file}")
     with open(metadata_json_file, "r") as outfile:
         metadata_json = json.load(outfile)
-    metadata = utils.configure_metadata_given_from_JSON(metadata_json, from_json=metadata_json_file)
+    metadata = utils.configure_metadata_given_from_JSON(
+        metadata_json, from_json=metadata_json_file)
     transcription.add_transcription_source(
         source_file=metadata["source_file"],
         loc=metadata["loc"],
@@ -404,6 +429,7 @@
         chapters=metadata["chapters"],
         link=metadata["media"],
         preprocess=False,
+        cutoff_date=metadata["cutoff_date"]
     )
     # Finalize transcription service output
     transcript_to_postprocess = transcription.transcripts[0]
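Putting it together, a preprocess run that skips everything published on or before 2024-01-01 might look like this (invocation sketch, assuming the script is run directly; the playlist URL is a placeholder):

    python transcriber.py preprocess "https://www.youtube.com/playlist?list=PLACEHOLDER" --cutoff-date 2024-01-01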
