import subprocess
import sys

## bootstrap: install any missing third-party libraries into the current interpreter
my_libraries = ['pyaudio', 'pynput', 'numpy', 'matplotlib', 'librosa', 'mutagen', 'pydub']
for module in my_libraries:
    try:
        __import__(module)
    except ImportError:
        subprocess.check_call([sys.executable, "-m", "pip", "install", module])
## importing the necessary libraries
from tkinter import filedialog
import pyaudio
import time
from pynput import keyboard
from pydub import AudioSegment
import wave
import os
import numpy as np
import matplotlib.pyplot as plt
import librosa
import librosa.display  # explicit import so specshow works on older librosa versions
from mutagen.wave import WAVE
import pathlib
from pathlib import Path
## declaring the necessary global variables
file = ""        ## selected file path
wav_file = ""    ## converted .wav path
plot_path = ""   ## path where the graph plots are stored
new_dir = ""     ## new directory for the output
paused = False   ## tracks whether the audio is paused
stopped = False  ## tracks whether the audio is stopped
avg_rms = 0.0    ## stores the average RMS value
## function to convert an .mp3 file to a .wav file
def mp3towav():
    global file, wav_file, plot_path, new_dir
    dir_name = os.path.dirname(file)
    base_file_name = Path(file).stem
    wav_file = dir_name + "/" + base_file_name + "_wav" + ".wav"
    print("\nConverted the .mp3 to a .wav file and saved it next to the selected .mp3 file...")
    print("Created : " + base_file_name + "_wav" + ".wav")
    ## assumes ffmpeg is available on PATH; point this at a full ffmpeg executable path if it is not
    subprocess.call(['ffmpeg', '-i', file, wav_file])
    ## creating a folder for saving the output plots
    new_dir = pathlib.Path(dir_name, base_file_name + " - Plot")
    new_dir.mkdir(parents=True, exist_ok=True)
    plot_path = os.path.dirname(file)
## function defined to play and pause the audio file
'''
ctrl - play / pause
esc  - stop
'''
def play_pause_stop():
    global paused, stopped, file, wav_file
    print("\nPlayer Starts...")
    ## opening the .wav audio file
    wf = wave.open(wav_file, 'rb')
    # instantiate PyAudio
    p = pyaudio.PyAudio()
    # callback that feeds the stream; the stream finishes when readframes returns fewer frames than requested
    def callback(in_data, frame_count, time_info, status):
        data = wf.readframes(frame_count)
        return (data, pyaudio.paContinue)
    ## key-press handler for the keyboard listener
    def on_press(key):
        global paused, stopped
        if key == keyboard.Key.ctrl_l or key == keyboard.Key.ctrl_r:
            if stream.is_stopped():  # time to play the audio
                print('[CTRL Pressed] Audio Playing...')
                stream.start_stream()
                paused = False
                return False
            elif stream.is_active():  # time to pause the audio
                print('[CTRL Pressed] Audio Paused...')
                stream.stop_stream()
                paused = True
                return False
        elif key == keyboard.Key.esc:
            print("[ESC Pressed] Audio Stopped...")
            stopped = True
            return False
        return False
    # open the stream using the callback
    stream = p.open(format=p.get_format_from_width(wf.getsampwidth()),
                    channels=wf.getnchannels(),
                    rate=wf.getframerate(),
                    output=True,
                    stream_callback=callback)
    # start the stream
    stream.start_stream()
    print("Stream Starts...")
    while stream.is_active() or paused:
        if stopped:
            break
        with keyboard.Listener(on_press=on_press) as listener:
            listener.join()
        time.sleep(0.1)
    # stop the stream and release resources
    stream.stop_stream()
    stream.close()
    wf.close()
    print("Stream Ends...")
    # close PyAudio
    p.terminate()
## function to split a duration in seconds into hours, minutes, and seconds
def audio_duration(length):
    hours = length // 3600   # whole hours
    length %= 3600
    mins = length // 60      # whole minutes
    length %= 60
    seconds = length         # remaining seconds
    return hours, mins, seconds
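## e.g. audio_duration(3725) returns (1, 2, 5), i.e. a duration of 1:02:05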
## function defined to extract the generic features of the audio file
'''
Channels: number of channels; 1 for mono, 2 for stereo audio
Sample width: number of bytes per sample; 1 means 8-bit, 2 means 16-bit
Frame rate/Sample rate: frequency of the samples used (in Hertz)
Frame width: number of bytes per frame; one frame contains one sample for each channel
Length: audio file length (in milliseconds)
Frame count: the number of frames in the sample
Intensity: loudness in dBFS (dB relative to the maximum possible loudness)
'''
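## dBFS = 20 * log10(level / max_possible_level); 0 dBFS is the loudest possible level, so values are typically negative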
def extract():
    global file, wav_file
    # load the file
    audio_segment = AudioSegment.from_file(wav_file)
    # print the attributes
    print("\nAudio Generic Features : ")
    print(f"Channels: {audio_segment.channels}")
    print(f"Sample width: {audio_segment.sample_width}")
    print(f"Frame rate (sample rate): {audio_segment.frame_rate}")
    print(f"Frame width: {audio_segment.frame_width}")
    print(f"Duration (sec): {audio_segment.frame_count()/audio_segment.frame_rate}")
    audio = WAVE(wav_file)
    audio_info = audio.info
    length = int(audio_info.length)
    hours, mins, seconds = audio_duration(length)
    print('Total Duration: {}:{:02d}:{:02d}'.format(hours, mins, seconds))
    print(f"Length (ms): {len(audio_segment)}")
    print(f"Frame count: {audio_segment.frame_count()}")
    print(f"Intensity: {audio_segment.dBFS}")
## function defined to generate the amplitude wave plot
def amplitude_wave():
    global file, wav_file, plot_path, new_dir
    print("\nGenerated Amplitude Wave Plot...")
    file_name = os.path.basename(wav_file)
    # open the wav file and read all frames as bytes
    sf_filewave = wave.open(wav_file, 'r')
    signal_sf = sf_filewave.readframes(-1)
    # convert the audio bytes to 16-bit integers (channels are interleaved for stereo files)
    soundwave_sf = np.frombuffer(signal_sf, dtype='int16')
    # get the sound wave frame rate
    framerate_sf = sf_filewave.getframerate()
    sf_filewave.close()
    # compute the sound wave timestamps
    time_sf = np.linspace(start=0, stop=len(soundwave_sf) / framerate_sf, num=len(soundwave_sf))
    # set up the plot
    f, ax = plt.subplots(figsize=(15, 5))
    # set the title and axis labels
    plt.title(file_name + ' - Amplitude over Time')
    plt.ylabel('Amplitude')
    plt.xlabel('Time (sec)')
    # add the audio data to the plot
    plt.plot(time_sf, soundwave_sf, label='Amplitude', alpha=0.5)
    plt.legend()
    ## saving the plot
    print("Saved " + file_name + ' - Amplitude over Time.jpg...')
    plt.savefig(os.path.join(str(new_dir), file_name + ' - Amplitude over Time.jpg'))
    plt.show()
## function defined to generate the spectrogram plot
'''
A spectrogram is a visual representation of the spectrum of frequencies of a signal as it varies with time.
The vertical axis shows frequency, the horizontal axis shows the time of the clip, and the color variation shows the intensity of the audio wave.
'''
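## librosa.amplitude_to_db converts the STFT magnitudes to decibels, essentially 20 * log10(|X| / ref)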
def spectogram():
    global file, wav_file, plot_path, new_dir
    print("\nGenerated Spectrogram Plot...")
    file_name = os.path.basename(wav_file)
    x, sr = librosa.load(wav_file)
    # short-time Fourier transform, then magnitudes converted to dB
    X = librosa.stft(x)
    Xdb = librosa.amplitude_to_db(np.abs(X))
    plt.figure(figsize=(15, 5))
    plt.title(file_name + ' - Spectrogram')
    librosa.display.specshow(Xdb, sr=sr, x_axis='time', y_axis='hz')
    plt.colorbar()
    ## saving the plot
    print("Saved " + file_name + ' - Spectrogram.jpg')
    plt.savefig(os.path.join(str(new_dir), file_name + ' - Spectrogram.jpg'))
    plt.show()
## function defined to generate RMS/Enerygy Spectogram
'''
Root Mean Square refers to total magnitude of the signal, which in layman terms can be interpreted as the loudness or energy parameter of the audio file.
For loud and rock music RMS value is high
'''
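## for a frame of N samples x[0..N-1], RMS = sqrt((1/N) * sum(x[n]^2)); librosa computes this per STFT frame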
def rms_energy_spectogram():
    global file, wav_file, plot_path, new_dir, avg_rms
    print("\nGenerated RMS/Energy Spectrogram Plot...")
    file_name = os.path.basename(wav_file)
    y, sr = librosa.load(wav_file)
    # get the RMS value from each frame's magnitude spectrum
    S, phase = librosa.magphase(librosa.stft(y))
    rms = librosa.feature.rms(S=S)
    # plot the RMS energy
    fig, ax = plt.subplots(figsize=(15, 6), nrows=2, sharex=True)
    times = librosa.times_like(rms)
    ax[0].semilogy(times, rms[0], label='RMS Energy')
    ## calculating the average RMS over all frames
    avg_rms = float(np.mean(rms[0]))
    ax[0].set(title=file_name + ' - RMS Energy', xticks=[])
    ax[0].legend()
    ax[0].label_outer()
    librosa.display.specshow(librosa.amplitude_to_db(S, ref=np.max), y_axis='log', x_axis='time', ax=ax[1])
    ax[1].set(title=file_name + ' - log Power spectrogram')
    ## saving the plot
    print("Saved " + file_name + ' - RMS Energy Spectrogram.jpg')
    plt.savefig(os.path.join(str(new_dir), file_name + ' - RMS Energy Spectrogram.jpg'))
    plt.show()
## function defined to generate the ZCR (Zero Crossing Rate) plot
'''
ZCR is the rate at which a signal changes from positive to zero to negative or from negative to zero to positive.
It is widely used in both speech recognition and music information retrieval, and is a key feature for classifying percussive sounds.
Highly percussive genres like rock, metal, emo, or punk tend to have higher zero-crossing rate values.
For loud and rock music, the ZCR value is high.
'''
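## ZCR per frame is (number of sign changes between consecutive samples) / (frame length),
## i.e. the fraction of adjacent sample pairs that cross zero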
def zero_crossing_rate():
    global file, wav_file, plot_path, new_dir
    print("\nGenerated Zero Crossing Rate Plot...")
    file_name = os.path.basename(wav_file)
    y, sr = librosa.load(wav_file)
    zcrs = librosa.feature.zero_crossing_rate(y)
    plt.figure(figsize=(15, 3))
    plt.plot(zcrs[0])
    plt.title(file_name + ' - Zero Crossing Rate')
    ## saving the plot
    print("Saved " + file_name + ' - Zero Crossing Rate.jpg')
    plt.savefig(os.path.join(str(new_dir), file_name + ' - Zero Crossing Rate.jpg'))
    plt.show()
## function defined to generate the MFCC (Mel-Frequency Cepstral Coefficients) plot
'''
MFCCs represent the short-term power spectrum of a sound, based on a transformation onto the Mel scale.
They are commonly used in speech recognition, as people’s voices usually sit in a certain frequency range that differs from person to person.
MFCC values for human speech tend to be lower and more dynamic than those of music files.
'''
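## the usual MFCC pipeline: STFT -> mel filterbank -> log of the mel energies -> discrete cosine transform (DCT)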
def mel_frequency_cepstral_coefficients():
    global file, wav_file, plot_path, new_dir
    print("\nGenerated Mel Frequency Cepstral Coefficients Plot...")
    file_name = os.path.basename(wav_file)
    x, sr = librosa.load(wav_file)
    mfccs = librosa.feature.mfcc(y=x, sr=sr)
    # displaying the MFCCs
    fig, ax = plt.subplots(figsize=(15, 3))
    img = librosa.display.specshow(mfccs, sr=sr, x_axis='time', ax=ax)
    fig.colorbar(img, ax=ax)
    ax.set(title=file_name + ' - Mel Frequency Cepstral Coefficients')
    ## saving the plot
    print("Saved " + file_name + ' - Mel Frequency Cepstral Coefficients.jpg')
    plt.savefig(os.path.join(str(new_dir), file_name + ' - Mel Frequency Cepstral Coefficients.jpg'))
    plt.show()
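## function defined to generate the Mel frequency spectrogram plot
'''
A mel spectrogram rescales the frequency axis of a spectrogram onto the Mel scale, which approximates human pitch perception.
One common mel mapping (the HTK formula) is mel = 2595 * log10(1 + f/700); librosa defaults to the Slaney variant.
'''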
def mel_frequency_spectogram():
    global file, wav_file, plot_path, new_dir
    print("\nGenerated Mel Frequency Spectrogram Plot...")
    file_name = os.path.basename(wav_file)
    y, sr = librosa.load(wav_file)
    S = librosa.feature.melspectrogram(y=y, sr=sr)
    S_dB = librosa.power_to_db(S, ref=np.max)
    fig, ax = plt.subplots(figsize=(15, 3))
    img = librosa.display.specshow(S_dB, sr=sr, x_axis='time', ax=ax)
    fig.colorbar(img, ax=ax, format='%+2.0f dB')
    ax.set(title=file_name + ' - Mel frequency spectrogram')
    ## saving the plot
    print("Saved " + file_name + ' - Mel frequency spectrogram.jpg')
    plt.savefig(os.path.join(str(new_dir), file_name + ' - Mel frequency spectrogram.jpg'))
    plt.show()
## function defined to visualize how dominant each pitch class {C, C♯, D, D♯, E, F, F♯, G, G♯, A, A♯, B} is
'''
The chroma feature shows how dominant the characteristics of each pitch class {C, C♯, D, D♯, E, F, F♯, G, G♯, A, A♯, B} are in each sampled frame.
'''
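## a chromagram folds all STFT frequency bins into the 12 pitch classes, summing energy across octaves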
def chroma_feature():
    global file, wav_file, plot_path, new_dir
    print("\nGenerated Chroma Feature Plot...")
    file_name = os.path.basename(wav_file)
    x, sr = librosa.load(wav_file)
    hop_length = 512
    chromagram = librosa.feature.chroma_stft(y=x, sr=sr, hop_length=hop_length)
    plt.figure(figsize=(15, 5))
    librosa.display.specshow(chromagram, x_axis='time', y_axis='chroma', hop_length=hop_length, cmap='coolwarm')
    plt.title(file_name + ' - Chroma Feature')
    ## saving the plot
    print("Saved " + file_name + ' - Chroma Feature.jpg')
    plt.savefig(os.path.join(str(new_dir), file_name + ' - Chroma Feature.jpg'))
    plt.show()
## function defined to generate the tempogram plot
'''
Tempo refers to the speed of an audio piece, usually measured in beats per minute (bpm).
Upbeat music like hip-hop, techno, or rock usually has a higher tempo than classical music,
so the tempogram feature can be useful for music genre classification.
'''
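## a tempogram is the local autocorrelation of the onset-strength envelope; a lag of k frames
## corresponds to a tempo of 60 * sr / (hop_length * k) bpm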
def tempogram():
    global file, wav_file, plot_path, new_dir
    print("\nGenerated Tempogram Plot...")
    file_name = os.path.basename(wav_file)
    y, sr = librosa.load(wav_file)
    hop_length = 512
    # compute the local onset autocorrelation
    oenv = librosa.onset.onset_strength(y=y, sr=sr, hop_length=hop_length)
    times = librosa.times_like(oenv, sr=sr, hop_length=hop_length)
    tempogram = librosa.feature.tempogram(onset_envelope=oenv, sr=sr, hop_length=hop_length)
    # estimate the global tempo for display purposes
    tempo = librosa.feature.rhythm.tempo(onset_envelope=oenv, sr=sr, hop_length=hop_length)[0]
    fig, ax = plt.subplots(nrows=2, figsize=(15, 6))
    ax[0].plot(times, oenv, label='Onset strength')
    ax[0].label_outer()
    ax[0].legend(frameon=True)
    librosa.display.specshow(tempogram, sr=sr, hop_length=hop_length, x_axis='time', y_axis='tempo', cmap='magma', ax=ax[1])
    ax[1].axhline(tempo, color='w', linestyle='--', alpha=1, label='Estimated tempo={:g}'.format(tempo))
    ax[1].legend(loc='upper right')
    ax[1].set(title=file_name + ' - Tempogram')
    ## saving the plot
    print("Saved " + file_name + ' - Tempogram.jpg')
    plt.savefig(os.path.join(str(new_dir), file_name + ' - Tempogram.jpg'))
    plt.show()
## function defined to give a suggestion on the volume
def suggest_volume():
    global avg_rms
    print("\nSuggestion on volume : ", end=' ')
    if avg_rms > 0.00001:
        print("High Volume")
    else:
        print("Low Volume")
## defining the main function
def main():
    global file
    print("Select Audio file : ")
    file = filedialog.askopenfilename(title="Select an Audio file", filetypes=[("Audio Files", "*.mp3"), ("All Files", "*.*")])
    if file != "":
        print(file)
        mp3towav()
    else:
        print("No File Selected")
## calling the main function, then running the analysis pipeline only when a file was selected
main()
if file != "":
    extract()
    play_pause_stop()
    amplitude_wave()
    spectogram()
    rms_energy_spectogram()
    zero_crossing_rate()
    mel_frequency_cepstral_coefficients()
    mel_frequency_spectogram()
    chroma_feature()
    tempogram()
    suggest_volume()