Skip to content

Commit

Permalink
Merge pull request #1 from organization-x/blake
Browse files Browse the repository at this point in the history
Added initial project.
  • Loading branch information
blakeamtech authored Feb 6, 2024
2 parents 567d31d + e8ea0b0 commit 533ecd6
Show file tree
Hide file tree
Showing 5 changed files with 428 additions and 2 deletions.
12 changes: 10 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,2 +1,10 @@
# VideoInterviewAutomation
Source code for video interview analysis automation.
# YouTube Interview Analysis Tool

This application evaluates YouTube video interviews to recommend whether the interviewee should be considered for a further interview based on a specific rubric. It leverages the YouTube Transcript API to fetch transcripts, analyzes the content with OpenAI's GPT-4, and provides recommendations through a simple web interface powered by Gradio.

## Features

- **Video ID Extraction**: Extracts the video ID from a YouTube URL.
- **Transcript Retrieval**: Retrieves the video's transcript along with its total duration and an estimated number of pauses.
- **GPT-4 Analysis**: Analyzes the transcript data against a predefined rubric to assess the interviewee's performance.
- **Gradio Interface**: Offers a user-friendly web interface for inputting YouTube URLs and receiving recommendations.
124 changes: 124 additions & 0 deletions app.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@
from flask import Flask, request, jsonify, make_response
from requests.auth import HTTPBasicAuth
from helpers import *
import requests
import os

app = Flask(__name__)
app.config['MAX_CONTENT_LENGTH'] = 16 * 1024 * 1024

def post_candidate_analysis_to_lever(analysis_result, candidate_id):
    """
    Post a candidate's video-interview analysis result to Lever as a note.

    Builds a POST request to the Lever notes endpoint for the opportunity
    identified by candidate_id, authenticating via HTTP Basic auth with the
    API key taken from the 'LeverKey' environment variable (empty password,
    per Lever's auth scheme).

    Parameters:
    - analysis_result (str): The result of the video interview analysis to be sent to Lever.
    - candidate_id (str): The unique identifier for the candidate/opportunity in Lever.

    Returns:
    - dict: The JSON response from the Lever API if the request is successful.
    - None: If the request fails for any reason (the error is logged, not raised).
    """
    lever_api_url = 'https://api.lever.co/v1/opportunities/{}/notes'.format(candidate_id)
    data = {
        "value": "Video Interview ML Decision: {}".format(analysis_result)
    }

    try:
        # Log the attempt to send data
        logging.info(f"Sending analysis result to Lever for candidate ID {candidate_id}")

        # timeout added: without it a stalled Lever API call would block this
        # webhook worker indefinitely (the Timeout handler below was unreachable).
        response = requests.post(
            lever_api_url,
            auth=HTTPBasicAuth(os.getenv('LeverKey'), ''),
            json=data,
            timeout=15,
        )

        # Raise for non-2xx responses so they are logged as HTTP errors below
        response.raise_for_status()

        # Log successful data sending
        logging.info(f"Successfully sent analysis result to Lever for candidate ID {candidate_id}")

        return response.json()
    except requests.exceptions.HTTPError as http_err:
        # Log HTTP errors (e.g., 404, 401, etc.)
        logging.error(f'HTTP error occurred: {http_err}')
    except requests.exceptions.ConnectionError as conn_err:
        # Log connection errors (e.g., DNS failure, refused connection, etc.)
        logging.error(f'Connection error occurred: {conn_err}')
    except requests.exceptions.Timeout as timeout_err:
        # Log timeout errors (now reachable thanks to the explicit timeout above)
        logging.error(f'Timeout error occurred: {timeout_err}')
    except requests.exceptions.RequestException as req_err:
        # Log any other requests-related errors
        logging.error(f'Error sending data to Lever: {req_err}')
    except Exception as e:
        # Catch-all for any other exceptions not related to requests
        logging.error(f'An unexpected error occurred: {e}')

    # Signal failure to the caller
    return None

@app.route('/webhook', methods=['POST'])
def handle_webhook():
    """
    Handle an incoming Lever webhook: analyze the candidate's video and report back.

    Expects a JSON body containing 'opportunityId'. Looks up the candidate's
    YouTube URL, runs the transcript analysis, and posts the verdict to Lever.

    Returns:
    - 200 with the analysis result (or a "cannot process URL" message) on success.
    - 400 when the body is empty or missing 'opportunityId'.
    - 500 when analysis or the Lever upload fails, or on any unexpected error.
    """
    try:
        payload = request.json
        if not payload:
            # Empty or non-JSON body
            logging.error("No data received in request")
            return make_response(jsonify({"error": "No data received"}), 400)

        opp_id = payload.get('opportunityId')
        if not opp_id:
            # The webhook payload must identify the opportunity
            logging.error("No opportunityId provided")
            return make_response(jsonify({"error": "No opportunityId provided"}), 400)

        video_url = get_youtube_url(opp_id)
        if not video_url:
            # No usable YouTube link on this opportunity — report politely with 200
            logging.error(f"Unable to process video URL for opportunityId {opp_id}")
            return jsonify("Unable to process the video URL. Currently only YouTube URLs are accepted."), 200

        verdict = analyze_transcript(video_url)
        if verdict is None:
            logging.error(f"Error analyzing transcript for opportunityId {opp_id}")
            return make_response(jsonify({"error": "Failed to analyze transcript"}), 500)

        if post_candidate_analysis_to_lever(verdict, opp_id) is None:
            # The poster returns None on any failure
            logging.error(f"Failed to send results to Lever for opportunityId {opp_id}")
            return make_response(jsonify({"error": "Failed to send results to Lever"}), 500)

        return jsonify(verdict), 200
    except Exception as e:
        logging.error(f"An unexpected error occurred: {e}")
        return make_response(jsonify({"error": "An unexpected error occurred"}), 500)

# Script entry point: run the Flask development server on port 5002.
# NOTE(review): debug=True enables the Werkzeug reloader and interactive
# debugger, which permits arbitrary code execution — confirm this is never
# used in a deployed environment (serve via a WSGI server instead).
if __name__ == '__main__':
    app.run(debug=True, port=5002)
203 changes: 203 additions & 0 deletions helpers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,203 @@
from youtube_transcript_api import YouTubeTranscriptApi
import openai
from urllib.parse import urlparse, parse_qs
import requests
from requests.auth import HTTPBasicAuth
import os
import logging

logging.basicConfig(filename='app.log', filemode='a',
format='%(name)s - %(levelname)s - %(message)s', level=logging.DEBUG)


def get_video_id_from_url(url):
    """
    Extract the YouTube video ID from a given URL.

    Supports both 'youtube.com' (via the 'v' query parameter) and 'youtu.be'
    (via the URL path) formats.

    Parameters:
        url (str): The full URL of the YouTube video.

    Returns:
        str: The extracted video ID if found, otherwise None.

    Note:
        Exceptions are logged and swallowed; the function returns None on error.
    """
    try:
        url_data = urlparse(url)
        if url_data.hostname in ('www.youtube.com', 'youtube.com'):
            video_id = parse_qs(url_data.query).get("v")
            if video_id:
                logging.info(f"Video ID {video_id[0]} extracted from URL.")
                return video_id[0]
        elif url_data.hostname == 'youtu.be':
            # Extract the video ID from the path for youtu.be URLs
            video_id = url_data.path[1:]  # Remove the leading '/'
            if video_id:
                logging.info(f"Video ID {video_id} extracted from URL.")
                return video_id

        logging.warning(f"No video ID found in URL: {url}")
        return None
    except Exception as e:
        # Bug fix: the original handler did not bind the exception ('except
        # Exception:') yet referenced 'e' in the log call, raising NameError.
        logging.error(f"Error extracting video ID from URL {url}: {e}")
        return None

def get_first_youtube_video_url(urls):
    """
    Return the first YouTube URL found in *urls*.

    A URL counts as a YouTube URL when it contains the substring 'youtube'
    or 'youtu.be'.

    Parameters:
        urls (list of str): Candidate URLs to scan, in order.

    Returns:
        str: The first matching URL, or None when no URL matches.
    """
    return next(
        (candidate for candidate in urls if 'youtube' in candidate or 'youtu.be' in candidate),
        None,
    )

def get_youtube_url(opportunity_id):
    """
    Retrieve the YouTube video URL attached to a Lever opportunity.

    Fetches the opportunity record from the Lever API and returns the first
    YouTube URL found in its 'links' section.

    Parameters:
        opportunity_id (str): The unique identifier for the opportunity in Lever.

    Returns:
        str: The YouTube video URL for the opportunity, or None when the
             opportunity has no links or none of them is a YouTube URL.

    Raises:
        requests.exceptions.RequestException: On network failure, timeout,
            or a non-2xx API response (handled by the webhook's catch-all).

    Note:
        Requires the 'LeverKey' environment variable for API authentication.
    """
    url = 'https://api.lever.co/v1/opportunities/{}'.format(opportunity_id)
    # timeout added so a stalled Lever call cannot hang the webhook worker
    response = requests.get(url, auth=HTTPBasicAuth(os.getenv('LeverKey'), ''), timeout=15)
    # Surface auth/HTTP failures as a clear HTTPError rather than a KeyError
    # from indexing an error-body JSON payload.
    response.raise_for_status()

    # Defensive extraction: a well-formed response without links simply means
    # there is no video to analyze, which callers treat as "no URL".
    links = response.json().get('data', {}).get('links') or []
    return get_first_youtube_video_url(links)

def parse_decision_to_binary(decision_text):
    """
    Interpret a textual decision as a boolean.

    Performs a case-insensitive substring check: any occurrence of 'yes'
    anywhere in the text counts as a positive decision.

    Parameters:
        decision_text (str): The decision text to inspect.

    Returns:
        bool: True if 'yes' appears in the text, False otherwise.
    """
    return "yes" in decision_text.lower()

def get_transcript_data_and_pause_count(video_id):
    """
    Fetch a video's English transcript, its duration in minutes, and a pause count.

    Uses YouTubeTranscriptApi to retrieve the transcript, computes the total
    duration from the final segment's start + duration, and estimates pauses
    as the number of gaps between consecutive segments.

    Parameters:
        video_id (str): The unique identifier of the YouTube video.

    Returns:
        tuple: (full_transcript (str), total_duration_minutes, pause_count (int)),
               or (None, None, None) if the transcript is empty or cannot be
               retrieved.
    """
    try:
        transcript = YouTubeTranscriptApi.get_transcript(video_id, languages=['en'])
        if transcript:
            last_segment = transcript[-1]
            total_duration = last_segment['start'] + last_segment['duration']

            # Estimate the number of pauses: count every gap between the end
            # of one segment and the start of the next.
            pauses = 0
            for i in range(1, len(transcript)):
                current_start = transcript[i]['start']
                previous_end = transcript[i-1]['start'] + transcript[i-1]['duration']
                if current_start > previous_end:
                    pauses += 1

            full_transcript = " ".join(segment['text'] for segment in transcript)
            logging.info(f"Transcript retrieved successfully for video ID {video_id}.")
            return full_transcript, total_duration // 60, pauses

        # Bug fix: an empty transcript previously fell off the end of the
        # function, returning a bare None instead of the documented 3-tuple
        # and breaking the caller's 3-way unpack.
        logging.warning(f"Empty transcript returned for video ID {video_id}.")
        return None, None, None
    except Exception as e:
        logging.error(f"Failed to retrieve transcript for video ID {video_id}. Error: {e}")
        return None, None, None

def analyze_transcript(url):
    """
    Analyze a YouTube interview transcript with GPT-4 against the rubric in prompt.txt.

    Reads the evaluation prompt from 'prompt.txt', extracts the video ID from
    the URL, fetches the transcript plus its duration and pause count, and asks
    the GPT model for a yes/no hiring decision.

    Parameters:
        url (str): The URL of the YouTube video to be analyzed.

    Returns:
        str: A qualification verdict, or an error message when the URL is
             invalid, the prompt file is unreadable, or processing fails.
        None: When the transcript could not be retrieved (callers treat None
              as an analysis failure).
    """
    try:
        with open('prompt.txt', 'r') as file:
            prompt = file.read()
    except Exception as e:
        logging.error(f"Error opening or reading from 'prompt.txt': {e}")
        return "Error processing the prompt file."

    try:
        video_id = get_video_id_from_url(url)
        if not video_id:
            logging.error("Invalid URL provided.")
            return "Unable to process the video URL. Currently only YouTube URLs are accepted."

        full_transcript, total_duration, pauses = get_transcript_data_and_pause_count(
            video_id)

        if full_transcript is None:  # If there was an error retrieving the transcript
            logging.error("Error retrieving the transcript.")
            # Bug fix: the original 'return pauses' was accidental — at this
            # point pauses is always None. Return None explicitly so the
            # caller's "analysis failed" path triggers unambiguously.
            return None

        # Fill the rubric prompt with the transcript and its metrics
        prompt = prompt.format(full_transcript, pauses, total_duration)

        # Using the new OpenAI client structure; key comes from the
        # 'OpenAIKey' environment variable.
        client = openai.OpenAI(api_key=os.getenv('OpenAIKey'))
        response = client.chat.completions.create(
            model="gpt-4",
            messages=[
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": prompt}
            ],
        )

        # Any 'yes' in the model's reply counts as a positive decision
        decision = parse_decision_to_binary(response.choices[0].message.content.strip())

        if decision:
            return "The candidate qualifies for an interview."
        return "The candidate does not qualify for an interview."
    except Exception as e:
        logging.error(f"An error occurred during the analysis: {e}")
        return f"An error occurred during the processing. {e}"
Loading

0 comments on commit 533ecd6

Please sign in to comment.