Skip to content

Commit 0734b65

Browse files
authored
spectrogram and inverse spectrogram (Tencent#5779)
* only supports hann, hamming and all-one window * inverse spectrogram does not support length parameter * spectrogram always returns torch.view_as_real(out) as ncnn does not support complex typed mat yet * inverse spectrogram always accepts torch.view_as_complex(in) as ncnn does not support complex typed mat yet
1 parent c043612 commit 0734b65

33 files changed

+3155
-22
lines changed

.ci/pnnx.yml

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,39 +31,51 @@ jobs:
3131
include:
3232
- torch-version: 1.8.1
3333
torchvision-version: 0.9.1
34+
torchaudio-version: 0.8.1
3435

3536
- torch-version: 1.9.1
3637
torchvision-version: 0.10.1
38+
torchaudio-version: 0.9.1
3739

3840
- torch-version: 1.10.0
3941
torchvision-version: 0.11.1
42+
torchaudio-version: '0.10.0+cpu'
4043

4144
- torch-version: 1.11.0
4245
torchvision-version: 0.12.0
46+
torchaudio-version: '0.11.0+cpu'
4347

4448
- torch-version: 1.12.0
4549
torchvision-version: 0.13.0
50+
torchaudio-version: '0.12.0+cpu'
4651

4752
- torch-version: 1.13.0
4853
torchvision-version: 0.14.0
54+
torchaudio-version: '0.13.0+cpu'
4955

5056
- torch-version: 2.0.0
5157
torchvision-version: 0.15.1
58+
torchaudio-version: '2.0.0+cpu'
5259

5360
- torch-version: 2.1.0
5461
torchvision-version: 0.16.0
62+
torchaudio-version: '2.1.0+cpu'
5563

5664
- torch-version: 2.2.1
5765
torchvision-version: 0.17.1
66+
torchaudio-version: '2.2.1+cpu'
5867

5968
- torch-version: 2.3.0
6069
torchvision-version: 0.18.0
70+
torchaudio-version: '2.3.0+cpu'
6171

6272
- torch-version: 2.4.0
6373
torchvision-version: 0.19.0
74+
torchaudio-version: '2.4.0+cpu'
6475

6576
- torch-version: 2.5.0
6677
torchvision-version: 0.20.0
78+
torchaudio-version: '2.5.0+cpu'
6779

6880
runs-on:
6981
pool-name: docker
@@ -169,7 +181,7 @@ jobs:
169181
- name: setup-pytorch
170182
run: |
171183
export PYTHONUSERBASE=${{ci.workspace}}/torch-${{matrix.torch-version}}
172-
pip3 install --user torch==${{matrix.torch-version}}+cpu torchvision==${{matrix.torchvision-version}}+cpu --index-url https://download.pytorch.org/whl/cpu
184+
pip3 install --user torch==${{matrix.torch-version}}+cpu torchvision==${{matrix.torchvision-version}}+cpu torchaudio==${{matrix.torchaudio-version}} --index-url https://download.pytorch.org/whl/cpu
173185
pip3 install --user onnx
174186
pip3 install --user onnxscript
175187

docs/developer-guide/operators.md

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@
4646
* [Input](#input)
4747
* [InstanceNorm](#instancenorm)
4848
* [Interp](#interp)
49+
* [InverseSpectrogram](#inversespectrogram)
4950
* [LayerNorm](#layernorm)
5051
* [Log](#log)
5152
* [LRN](#lrn)
@@ -81,6 +82,7 @@
8182
* [Slice](#slice)
8283
* [Softmax](#softmax)
8384
* [Softplus](#softplus)
85+
* [Spectrogram](#spectrogram)
8486
* [Split](#split)
8587
* [Swish](#swish)
8688
* [TanH](#tanh)
@@ -1141,6 +1143,30 @@ Resize type:
11411143
- 2 = Bilinear
11421144
- 3 = Bicubic
11431145

1146+
# InverseSpectrogram
1147+
```
1148+
x1 = x as complex
1149+
x1 = x1 * sqrt(norm) if normalized
1150+
y = istft(x1)
1151+
y1 = unpad(y) if center
1152+
1153+
if returns == 0 return y1 as complex
1154+
if returns == 1 return y1 real
1155+
if returns == 2 return y1 imag
1156+
```
1157+
1158+
* one_blob_only
1159+
1160+
| param id | name | type | default | description |
1161+
| --------- | ------------- | ----- | --------- | ----------------- |
1162+
| 0 | n_fft | int | 0 | |
1163+
| 1 | returns | int | 0 | 0=complex 1=real 2=imag |
1164+
| 2 | hoplen | int | n_fft / 4 | |
1165+
| 3 | winlen | int | n_fft | |
1166+
| 4 | window_type | int | 0 | 0=ones 1=hann 2=hamming |
1167+
| 5 | center | int | 1 | |
1168+
| 7 | normalized | int | 0 | 0=no 1=n_fft 2=window-l2-energy |
1169+
11441170
# LayerNorm
11451171
```
11461172
split x along outmost axis into part x0, x1 ...
@@ -1829,6 +1855,31 @@ y = log(exp(x) + 1)
18291855
* one_blob_only
18301856
* support_inplace
18311857

1858+
# Spectrogram
1859+
```
1860+
x1 = pad(x) if center
1861+
y = stft(x1)
1862+
y = y / sqrt(norm) if normalized
1863+
1864+
if power == 0 return y as real
1865+
if power == 1 return magnitude
1866+
if power == 2 return square of magnitude
1867+
```
1868+
1869+
* one_blob_only
1870+
1871+
| param id | name | type | default | description |
1872+
| --------- | ------------- | ----- | --------- | ----------------- |
1873+
| 0 | n_fft | int | 0 | |
1874+
| 1 | power | int | 0 | |
1875+
| 2 | hoplen | int | n_fft / 4 | |
1876+
| 3 | winlen | int | n_fft | |
1877+
| 4 | window_type | int | 0 | 0=ones 1=hann 2=hamming |
1878+
| 5 | center | int | 1 | |
1879+
| 6 | pad_type | int | 2 | 0=CONSTANT 1=REPLICATE 2=REFLECT |
1880+
| 7 | normalized | int | 0 | 0=no 1=n_fft 2=window-l2-energy |
1881+
| 8 | onesided | int | 1 | |
1882+
18321883
# Split
18331884
```
18341885
y0, y1 ... = x

src/CMakeLists.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -167,6 +167,8 @@ ncnn_add_layer(Diag)
167167
ncnn_add_layer(CELU)
168168
ncnn_add_layer(Shrink)
169169
ncnn_add_layer(RMSNorm)
170+
ncnn_add_layer(Spectrogram)
171+
ncnn_add_layer(InverseSpectrogram)
170172

171173
if(NCNN_VULKAN)
172174
ncnn_add_shader(${CMAKE_CURRENT_SOURCE_DIR}/convert_ycbcr.comp)

src/layer/inversespectrogram.cpp

Lines changed: 238 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,238 @@
1+
// Tencent is pleased to support the open source community by making ncnn available.
2+
//
3+
// Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
4+
//
5+
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
6+
// in compliance with the License. You may obtain a copy of the License at
7+
//
8+
// https://opensource.org/licenses/BSD-3-Clause
9+
//
10+
// Unless required by applicable law or agreed to in writing, software distributed
11+
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
12+
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
13+
// specific language governing permissions and limitations under the License.
14+
15+
#include "inversespectrogram.h"
16+
17+
namespace ncnn {
18+
19+
// Sets the layer capability flags: the layer works on a single input blob
// and a single output blob, and it never runs in place.
InverseSpectrogram::InverseSpectrogram()
{
    // forward() always creates a separate top blob, so in-place is disabled
    support_inplace = false;

    // exactly one bottom blob and one top blob
    one_blob_only = true;
}
24+
25+
int InverseSpectrogram::load_param(const ParamDict& pd)
26+
{
27+
n_fft = pd.get(0, 0);
28+
returns = pd.get(1, 0);
29+
hoplen = pd.get(2, n_fft / 4);
30+
winlen = pd.get(3, n_fft);
31+
window_type = pd.get(4, 0);
32+
center = pd.get(5, 1);
33+
normalized = pd.get(7, 0);
34+
35+
// assert winlen <= n_fft
36+
// generate window
37+
window_data.create(normalized == 2 ? n_fft + 1 : n_fft);
38+
{
39+
float* p = window_data;
40+
for (int i = 0; i < (n_fft - winlen) / 2; i++)
41+
{
42+
*p++ = 0.f;
43+
}
44+
if (window_type == 0)
45+
{
46+
// all ones
47+
for (int i = 0; i < winlen; i++)
48+
{
49+
*p++ = 1.f;
50+
}
51+
}
52+
if (window_type == 1)
53+
{
54+
// hann window
55+
for (int i = 0; i < winlen; i++)
56+
{
57+
*p++ = 0.5f * (1 - cosf(2 * 3.14159265358979323846 * i / winlen));
58+
}
59+
}
60+
if (window_type == 2)
61+
{
62+
// hamming window
63+
for (int i = 0; i < winlen; i++)
64+
{
65+
*p++ = 0.54f - 0.46f * cosf(2 * 3.14159265358979323846 * i / winlen);
66+
}
67+
}
68+
for (int i = 0; i < n_fft - winlen - (n_fft - winlen) / 2; i++)
69+
{
70+
*p++ = 0.f;
71+
}
72+
73+
// pre-calculated window norm factor
74+
if (normalized == 2)
75+
{
76+
float sqsum = 0.f;
77+
for (int i = 0; i < n_fft; i++)
78+
{
79+
sqsum += window_data[i] * window_data[i];
80+
}
81+
window_data[n_fft] = sqrt(sqsum);
82+
}
83+
}
84+
85+
return 0;
86+
}
87+
88+
int InverseSpectrogram::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
89+
{
90+
// https://github.com/librosa/librosa/blob/main/librosa/core/spectrum.py#L630
91+
92+
// TODO custom window
93+
// TODO output length
94+
95+
const int frames = bottom_blob.h;
96+
const int freqs = bottom_blob.c;
97+
// assert freqs == n_fft or freqs == n_fft / 2 + 1
98+
99+
const int onesided = freqs == n_fft / 2 + 1 ? 1 : 0;
100+
101+
const int outsize = center ? (frames - 1) * hoplen + (n_fft - n_fft / 2 * 2) : (frames - 1) * hoplen + n_fft;
102+
103+
const size_t elemsize = bottom_blob.elemsize;
104+
105+
if (returns == 0)
106+
{
107+
top_blob.create(2, outsize, elemsize, opt.blob_allocator);
108+
}
109+
else
110+
{
111+
top_blob.create(outsize, elemsize, opt.blob_allocator);
112+
}
113+
if (top_blob.empty())
114+
return -100;
115+
116+
Mat window_sumsquare(outsize + n_fft, elemsize, opt.workspace_allocator);
117+
if (window_sumsquare.empty())
118+
return -100;
119+
120+
top_blob.fill(0.f);
121+
window_sumsquare.fill(0.f);
122+
123+
for (int j = 0; j < frames; j++)
124+
{
125+
// collect complex
126+
Mat sp(2, n_fft);
127+
if (onesided == 1)
128+
{
129+
for (int k = 0; k < n_fft / 2 + 1; k++)
130+
{
131+
sp.row(k)[0] = bottom_blob.channel(k).row(j)[0];
132+
sp.row(k)[1] = bottom_blob.channel(k).row(j)[1];
133+
}
134+
for (int k = n_fft / 2 + 1; k < n_fft; k++)
135+
{
136+
sp.row(k)[0] = bottom_blob.channel(n_fft - k).row(j)[0];
137+
sp.row(k)[1] = -bottom_blob.channel(n_fft - k).row(j)[1];
138+
}
139+
}
140+
else
141+
{
142+
for (int k = 0; k < n_fft; k++)
143+
{
144+
sp.row(k)[0] = bottom_blob.channel(k).row(j)[0];
145+
sp.row(k)[1] = bottom_blob.channel(k).row(j)[1];
146+
}
147+
}
148+
149+
if (normalized == 1)
150+
{
151+
float norm = sqrt(n_fft);
152+
for (int i = 0; i < 2 * n_fft; i++)
153+
{
154+
sp[i] *= norm;
155+
}
156+
}
157+
if (normalized == 2)
158+
{
159+
float norm = window_data[n_fft];
160+
for (int i = 0; i < 2 * n_fft; i++)
161+
{
162+
sp[i] *= norm;
163+
}
164+
}
165+
166+
#pragma omp parallel for num_threads(opt.num_threads)
167+
for (int i = 0; i < n_fft; i++)
168+
{
169+
// inverse dft
170+
float re = 0.f;
171+
float im = 0.f;
172+
for (int k = 0; k < n_fft; k++)
173+
{
174+
double angle = 2 * 3.14159265358979323846 * i * k / n_fft;
175+
176+
re += sp.row(k)[0] * cosf(angle) - sp.row(k)[1] * sinf(angle);
177+
im += sp.row(k)[0] * sinf(angle) + sp.row(k)[1] * cosf(angle);
178+
}
179+
180+
re /= n_fft;
181+
im /= n_fft;
182+
183+
// apply window
184+
re *= window_data[i];
185+
im *= window_data[i];
186+
187+
int output_index = j * hoplen + i;
188+
if (center == 1)
189+
{
190+
output_index -= n_fft / 2;
191+
}
192+
if (output_index >= 0 && output_index < outsize)
193+
{
194+
// square window
195+
window_sumsquare[output_index] += window_data[i] * window_data[i];
196+
197+
if (returns == 0)
198+
{
199+
top_blob.row(output_index)[0] += re;
200+
top_blob.row(output_index)[1] += im;
201+
}
202+
if (returns == 1)
203+
{
204+
top_blob[output_index] += re;
205+
}
206+
if (returns == 2)
207+
{
208+
top_blob[output_index] += im;
209+
}
210+
}
211+
}
212+
}
213+
214+
// square window norm
215+
if (returns == 0)
216+
{
217+
for (int i = 0; i < outsize; i++)
218+
{
219+
if (window_sumsquare[i] != 0.f)
220+
{
221+
top_blob.row(i)[0] /= window_sumsquare[i];
222+
top_blob.row(i)[1] /= window_sumsquare[i];
223+
}
224+
}
225+
}
226+
else
227+
{
228+
for (int i = 0; i < outsize; i++)
229+
{
230+
if (window_sumsquare[i] != 0.f)
231+
top_blob[i] /= window_sumsquare[i];
232+
}
233+
}
234+
235+
return 0;
236+
}
237+
238+
} // namespace ncnn

0 commit comments

Comments
 (0)