# video_processing.py
#
# Summarizes a video: samples frames and describes them with a vision model,
# transcribes the audio track, then asks an LLM to combine both.
import os
import cv2
from io import BytesIO
from PIL import Image
import together
from llama_index.llms.together import TogetherLLM
from llama_index.schema import Document
from groq import Groq
from moviepy.editor import VideoFileClip
import tempfile
import base64
print("Starting video processing script...")

# Credentials are taken from the environment so real keys never have to be
# written into this file. setdefault() keeps a GROQ_API_KEY that is already
# exported instead of overwriting it with the empty placeholder.
os.environ.setdefault('GROQ_API_KEY', '')  # or configure your groq api key here

# Initialize clients
# Groq() is given no explicit key here, so it relies on GROQ_API_KEY above.
groq_client = Groq()
# Falls back to the empty placeholder when TOGETHER_API_KEY is not exported.
together.api_key = os.environ.get("TOGETHER_API_KEY", "")  # or configure your together.ai api key here
print("Initialized API clients")

# Initialize LLM for text synthesis
llm = TogetherLLM(
    model="meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo",
    api_key=together.api_key,
)
print("Initialized LLM for text synthesis")
def extract_frames(video_path, interval=30):
    """Sample one frame from a video every ``interval`` seconds.

    Args:
        video_path: Path of the video file, passed to ``cv2.VideoCapture``.
        interval: Seconds between sampled frames. Must be positive.

    Returns:
        A list of PIL images converted from BGR to RGB, beginning with the
        first frame of the video. The list is empty when no frame can be read.

    Raises:
        ValueError: If ``interval`` is not positive, or if frames can be read
            but the capture does not report a usable frame rate.
    """
    if interval <= 0:
        raise ValueError(
            f"interval must be a positive number of seconds, got {interval!r}"
        )

    print(f"Extracting frames from {video_path} at {interval} second intervals")
    frames = []
    cap = cv2.VideoCapture(video_path)
    try:
        fps = cap.get(cv2.CAP_PROP_FPS)
        # Frames between two samples, computed once instead of per frame.
        # Clamped to 1 so an interval shorter than one frame period samples
        # every frame rather than triggering a modulo-by-zero. A step of 0
        # marks a frame rate that cannot be used (zero, negative or NaN).
        step = max(1, int(fps * interval)) if fps > 0 else 0
        frame_count = 0
        while True:
            ret, frame = cap.read()
            if not ret:
                break
            if step == 0:
                # Only raised once a frame was actually read, so a capture
                # that yields no frames still returns an empty list.
                raise ValueError(
                    f"Could not determine a valid frame rate for {video_path} "
                    f"(reported fps={fps!r})"
                )
            if frame_count % step == 0:
                # The frame is converted from BGR to RGB before wrapping it
                # in a PIL image.
                frames.append(
                    Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
                )
                print(f"Extracted frame at {frame_count/fps:.2f} seconds")
            frame_count += 1
    finally:
        # Release the capture even if decoding or conversion fails.
        cap.release()
    print(f"Extracted {len(frames)} frames in total")
    return frames
def analyze_frames(frames):
    """Request a detailed description of every frame from the vision model.

    Args:
        frames: Sized sequence of image objects exposing
            ``save(fp, format=...)``, such as the images produced by
            ``extract_frames``.

    Returns:
        A list holding the model's reply content for each frame, in the same
        order as ``frames``.
    """
    print("Analyzing frames...")
    vision_client = together.Together(api_key=together.api_key)
    total = len(frames)
    results = []
    for position, image in enumerate(frames, start=1):
        print(f"Analyzing frame {position}/{total}")

        # Encode the frame as an in-memory JPEG and embed it as a base64
        # data URL so it can travel inside the chat message.
        jpeg_buffer = BytesIO()
        image.save(jpeg_buffer, format="JPEG")
        encoded = base64.b64encode(jpeg_buffer.getvalue()).decode()

        text_part = {"type": "text", "text": "Describe this image in detail."}
        image_part = {
            "type": "image_url",
            "image_url": {
                "url": f"data:image/jpeg;base64,{encoded}",
            },
        }
        user_message = {"role": "user", "content": [text_part, image_part]}

        completion = vision_client.chat.completions.create(
            model="meta-llama/Llama-3.2-11B-Vision-Instruct-Turbo",
            messages=[user_message],
            max_tokens=300,
        )
        results.append(completion.choices[0].message.content)
    print("Frame analysis complete")
    return results
def extract_audio(video_path):
    """Write the audio track of a video to a temporary MP3 file.

    Args:
        video_path: Path of the video file, passed to ``VideoFileClip``.

    Returns:
        Path of the temporary ``.mp3`` file. The file is created with
        ``delete=False``, so the caller is responsible for removing it.

    Raises:
        ValueError: If the clip exposes no audio track.
    """
    print(f"Extracting audio from {video_path}")
    video = VideoFileClip(video_path)
    try:
        audio = video.audio
        if audio is None:
            # Without this check the failure surfaces as an AttributeError
            # on None after the temporary file has already been created.
            raise ValueError(f"No audio track found in {video_path}")

        # Reserve a unique path, then close the handle before the audio is
        # written so the writer is not competing with our own open handle.
        with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as temp_audio:
            temp_audio_path = temp_audio.name
        try:
            audio.write_audiofile(temp_audio_path, codec='mp3')
        except BaseException:
            # delete=False means nothing else will clean this up on failure.
            if os.path.exists(temp_audio_path):
                os.unlink(temp_audio_path)
            raise
    finally:
        # Close the clip on every path, including errors.
        video.close()
    print(f"Audio extracted to {temp_audio_path}")
    return temp_audio_path
def transcribe_audio(audio_path):
    """Transcribe an audio file through the module-level Groq client.

    Args:
        audio_path: Path of the audio file; it is read in binary mode and
            also sent as the upload's file name.

    Returns:
        Whatever ``groq_client.audio.transcriptions.create`` returns for
        ``response_format="text"``.
    """
    print(f"Transcribing audio from {audio_path}")
    with open(audio_path, "rb") as audio_file:
        audio_bytes = audio_file.read()

    # Fixed request settings: English-only distilled Whisper model,
    # plain-text output, temperature 0.0.
    request_options = {
        "model": "distil-whisper-large-v3-en",
        "response_format": "text",
        "language": "en",
        "temperature": 0.0,
    }
    transcript = groq_client.audio.transcriptions.create(
        file=(audio_path, audio_bytes),
        **request_options,
    )
    print("Audio transcription complete")
    return transcript
def synthesize_results(frame_descriptions, audio_transcription):
    """Combine frame descriptions and the transcript into one LLM summary.

    Args:
        frame_descriptions: Iterable of description strings; they are joined
            with single spaces into the "Visual content" section.
        audio_transcription: Transcript text placed in the "Audio content"
            section.

    Returns:
        The ``text`` attribute of the completion returned by the module-level
        ``llm``.
    """
    print("Synthesizing results...")
    visual_content = ' '.join(frame_descriptions)
    # NOTE(review): the prompt says "first 5 minutes", but nothing in this
    # function limits the input to five minutes - confirm against the caller.
    # The final instruction sentence was previously garbled (typo and missing
    # noun); it now states explicitly that a summary is requested.
    prompt = (
        "\n"
        "Synthesize the following information from the first 5 minutes of a video:\n"
        "Visual content:\n"
        f"{visual_content}\n"
        "Audio content:\n"
        f"{audio_transcription}\n"
        "Provide a summary of the video content that is as comprehensive and "
        "detailed as possible, incorporating both visual and audio information, "
        "including time stamps if available.\n"
    )
    response = llm.complete(prompt)
    print("Results synthesis complete")
    return response.text
def process_video(video_path):
    """Run the full pipeline on one video and wrap the summary in a Document.

    The steps are: sample and describe frames, extract and transcribe the
    audio, synthesize both into a summary, and build a ``Document``.

    Args:
        video_path: Path of the video file to process.

    Returns:
        A ``Document`` whose text is the synthesized summary and whose
        metadata holds ``file_name``, ``file_path`` and ``content_type``.
    """
    print(f"Processing video: {video_path}")

    # Extract and analyze frames
    frames = extract_frames(video_path)
    frame_descriptions = analyze_frames(frames)

    # Extract audio from video and transcribe
    temp_audio_path = extract_audio(video_path)
    try:
        audio_transcription = transcribe_audio(temp_audio_path)
    finally:
        # Clean up the temporary audio file even when transcription fails,
        # so a failed run does not leave the file behind.
        os.unlink(temp_audio_path)
        print(f"Removed temporary audio file: {temp_audio_path}")

    # Synthesize results
    summary = synthesize_results(frame_descriptions, audio_transcription)

    # Create a Document object
    doc = Document(
        text=summary,
        metadata={
            "file_name": os.path.basename(video_path),
            "file_path": video_path,
            "content_type": "video_summary",
        },
    )
    print("Video processing complete")
    return doc
# Usage
# Guarded so that importing this module does not start processing a video.
if __name__ == "__main__":
    video_path = "/Users/jrodge/Downloads/videoplayback.mp4"
    print(f"Starting video processing for: {video_path}")
    # The pipeline call was missing, which left `result` undefined and made
    # the final print fail with a NameError.
    result = process_video(video_path)
    print("Final result:")
    print(result)