Skip to content

Commit 0b3b574

Browse files
andrula-song authored and lgirdwood committed
Audio: Mixin_mixout: Add HiFi5 implementation.
Add a HiFi5 implementation of the mix functions. Compared with the HiFi3 version, it reduces the cycle count by about 27%. Signed-off-by: Andrula Song <[email protected]>
1 parent ab87904 commit 0b3b574

7 files changed

+326
-15
lines changed

src/audio/mixin_mixout/CMakeLists.txt

+1-1
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
add_local_sources(sof mixin_mixout.c mixin_mixout_generic.c mixin_mixout_hifi3.c)
1+
add_local_sources(sof mixin_mixout.c mixin_mixout_generic.c mixin_mixout_hifi3.c mixin_mixout_hifi5.c)

src/audio/mixin_mixout/Kconfig

+37
Original file line numberDiff line numberDiff line change
@@ -6,3 +6,40 @@ config COMP_MIXIN_MIXOUT
66
default y
77
help
88
Select for Mixin_mixout component
9+
10+
# Selects which implementation of the MIXIN_MIXOUT mix functions is built:
# the highest HiFi level the toolchain offers, a specific HiFi level, or the
# generic C code.
choice "MIXIN_MIXOUT_SIMD_LEVEL_SELECT"
	prompt "choose which SIMD level used for MIXIN_MIXOUT module"
	depends on COMP_MIXIN_MIXOUT
	default MIXIN_MIXOUT_HIFI_MAX

	config MIXIN_MIXOUT_HIFI_MAX
		prompt "Max HiFi level available in the toolchain"
		bool
		help
			When this was selected, optimization level will be determined
			by toolchain.

	config MIXIN_MIXOUT_HIFI_5
		prompt "choose HIFI5 intrinsic optimized MIXIN_MIXOUT module"
		bool
		help
			This option used to build HIFI5 optimized MIXIN_MIXOUT code

	config MIXIN_MIXOUT_HIFI_4
		prompt "choose HIFI4 intrinsic optimized MIXIN_MIXOUT module"
		bool
		help
			This option used to build HIFI4 optimized MIXIN_MIXOUT code

	config MIXIN_MIXOUT_HIFI_3
		prompt "choose HIFI3 intrinsic optimized MIXIN_MIXOUT module"
		bool
		help
			This option used to build HIFI3 intrinsic optimized MIXIN_MIXOUT code

	config MIXIN_MIXOUT_HIFI_NONE
		prompt "choose generic C MIXIN_MIXOUT module, no HIFI SIMD involved"
		bool
		help
			This option used to build MIXIN_MIXOUT generic code.
endchoice

src/audio/mixin_mixout/mixin_mixout.h

-12
Original file line numberDiff line numberDiff line change
@@ -31,18 +31,6 @@
3131
#include <sof/platform.h>
3232
#include <stddef.h>
3333

34-
#define MIXIN_MIXOUT_GENERIC
35-
36-
#if defined(__XCC__)
37-
38-
#include <xtensa/config/core-isa.h>
39-
#if XCHAL_HAVE_HIFI3 || XCHAL_HAVE_HIFI4
40-
#undef MIXIN_MIXOUT_GENERIC
41-
#define MIXIN_MIXOUT_HIFI3
42-
#endif
43-
44-
#endif
45-
4634
enum ipc4_mixin_config_param {
4735
/* large_config_set param id for ipc4_mixer_mode_config */
4836
IPC4_MIXER_MODE = 1

src/audio/mixin_mixout/mixin_mixout_generic.c

+1-1
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99

1010
#include "mixin_mixout.h"
1111

12-
#ifdef MIXIN_MIXOUT_GENERIC
12+
#if SOF_USE_HIFI(NONE, MIXIN_MIXOUT)
1313

1414
#if CONFIG_FORMAT_S16LE
1515
static void mix_s16(struct cir_buf_ptr *sink, int32_t start_sample, int32_t mixed_samples,

src/audio/mixin_mixout/mixin_mixout_hifi3.c

+1-1
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88

99
#include "mixin_mixout.h"
1010

11-
#ifdef MIXIN_MIXOUT_HIFI3
11+
#if SOF_USE_HIFI(3, MIXIN_MIXOUT) || SOF_USE_HIFI(4, MIXIN_MIXOUT)
1212

1313
#if CONFIG_FORMAT_S16LE
1414
static void mix_s16(struct cir_buf_ptr *sink, int32_t start_sample, int32_t mixed_samples,
+285
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,285 @@
1+
// SPDX-License-Identifier: BSD-3-Clause
2+
//
3+
// Copyright(c) 2024 Intel Corporation. All rights reserved.
4+
//
5+
// Author: Andrula Song <[email protected]>
6+
7+
#include <sof/common.h>
8+
9+
#include "mixin_mixout.h"
10+
11+
#if SOF_USE_HIFI(5, MIXIN_MIXOUT)
12+
13+
#if CONFIG_FORMAT_S16LE
14+
static void mix_s16(struct cir_buf_ptr *sink, int32_t start_sample, int32_t mixed_samples,
15+
const struct cir_buf_ptr *source,
16+
int32_t sample_count, uint16_t gain)
17+
{
18+
int samples_to_mix, samples_to_copy, left_samples;
19+
int n, nmax, i, m, left;
20+
ae_int16x4 in_sample, in_sample1;
21+
ae_int16x4 out_sample, out_sample1;
22+
ae_int16x8 *in;
23+
ae_int16x8 *out;
24+
ae_valignx2 inu = AE_ZALIGN128();
25+
ae_valignx2 outu1 = AE_ZALIGN128();
26+
ae_valignx2 outu2 = AE_ZALIGN128();
27+
/* cir_buf_wrap() is required and is done below in a loop */
28+
ae_int16 *dst = (ae_int16 *)sink->ptr + start_sample;
29+
ae_int16 *src = source->ptr;
30+
31+
assert(mixed_samples >= start_sample);
32+
samples_to_mix = AE_MIN32(mixed_samples - start_sample, sample_count);
33+
samples_to_copy = sample_count - samples_to_mix;
34+
n = 0;
35+
36+
for (left_samples = samples_to_mix; left_samples > 0; left_samples -= n) {
37+
src = cir_buf_wrap(src + n, source->buf_start, source->buf_end);
38+
dst = cir_buf_wrap(dst + n, sink->buf_start, sink->buf_end);
39+
/* calculate the remaining samples*/
40+
nmax = (ae_int16 *)source->buf_end - src;
41+
n = AE_MIN32(left_samples, nmax);
42+
nmax = (ae_int16 *)sink->buf_end - dst;
43+
n = AE_MIN32(n, nmax);
44+
in = (ae_int16x8 *)src;
45+
out = (ae_int16x8 *)dst;
46+
inu = AE_LA128_PP(in);
47+
outu1 = AE_LA128_PP(out);
48+
m = n >> 3;
49+
left = n & 0x07;
50+
/* process 8 samples per loop */
51+
for (i = 0; i < m; i++) {
52+
AE_LA16X4X2_IP(in_sample, in_sample1, inu, in);
53+
AE_LA16X4X2_IP(out_sample, out_sample1, outu1, out);
54+
out--;
55+
out_sample = AE_ADD16S(in_sample, out_sample);
56+
out_sample1 = AE_ADD16S(in_sample1, out_sample1);
57+
AE_SA16X4X2_IP(out_sample, out_sample1, outu2, out);
58+
}
59+
AE_SA128POS_FP(outu2, out);
60+
61+
/* process the left samples that less than 8
62+
* one by one to avoid memory access overrun
63+
*/
64+
for (i = 0; i < left ; i++) {
65+
AE_L16_IP(in_sample, (ae_int16 *)in, sizeof(ae_int16));
66+
AE_L16_IP(out_sample, (ae_int16 *)out, 0);
67+
out_sample = AE_ADD16S(in_sample, out_sample);
68+
AE_S16_0_IP(out_sample, (ae_int16 *)out, sizeof(ae_int16));
69+
}
70+
}
71+
72+
for (left_samples = samples_to_copy; left_samples > 0; left_samples -= n) {
73+
src = cir_buf_wrap(src + n, source->buf_start, source->buf_end);
74+
dst = cir_buf_wrap(dst + n, sink->buf_start, sink->buf_end);
75+
/* calculate the remaining samples*/
76+
nmax = (ae_int16 *)source->buf_end - src;
77+
n = AE_MIN32(left_samples, nmax);
78+
nmax = (ae_int16 *)sink->buf_end - dst;
79+
n = AE_MIN32(n, nmax);
80+
in = (ae_int16x8 *)src;
81+
out = (ae_int16x8 *)dst;
82+
inu = AE_LA128_PP(in);
83+
m = n >> 3;
84+
left = n & 0x07;
85+
/* process 8 frames per loop */
86+
for (i = 0; i < m; i++) {
87+
AE_LA16X4X2_IP(in_sample, in_sample1, inu, in);
88+
AE_SA16X4X2_IP(in_sample, in_sample1, outu2, out);
89+
}
90+
AE_SA128POS_FP(outu2, out);
91+
92+
/* process the left samples that less than 8
93+
* one by one to avoid memory access overrun
94+
*/
95+
for (i = 0; i < left ; i++) {
96+
AE_L16_IP(in_sample, (ae_int16 *)in, sizeof(ae_int16));
97+
AE_S16_0_IP(in_sample, (ae_int16 *)out, sizeof(ae_int16));
98+
}
99+
}
100+
}
101+
#endif /* CONFIG_FORMAT_S16LE */
102+
103+
#if CONFIG_FORMAT_S24LE
104+
static void mix_s24(struct cir_buf_ptr *sink, int32_t start_sample, int32_t mixed_samples,
105+
const struct cir_buf_ptr *source,
106+
int32_t sample_count, uint16_t gain)
107+
{
108+
int samples_to_mix, samples_to_copy, left_samples;
109+
int n, nmax, i, m, left;
110+
ae_int32x2 in_sample, in_sample1;
111+
ae_int32x2 out_sample, out_sample1;
112+
ae_int32x4 *in;
113+
ae_int32x4 *out;
114+
ae_valignx2 inu = AE_ZALIGN128();
115+
ae_valignx2 outu1 = AE_ZALIGN128();
116+
ae_valignx2 outu2 = AE_ZALIGN128();
117+
/* cir_buf_wrap() is required and is done below in a loop */
118+
int32_t *dst = (int32_t *)sink->ptr + start_sample;
119+
int32_t *src = source->ptr;
120+
121+
assert(mixed_samples >= start_sample);
122+
samples_to_mix = AE_MIN32(mixed_samples - start_sample, sample_count);
123+
samples_to_copy = sample_count - samples_to_mix;
124+
n = 0;
125+
126+
for (left_samples = samples_to_mix; left_samples > 0; left_samples -= n) {
127+
src = cir_buf_wrap(src + n, source->buf_start, source->buf_end);
128+
dst = cir_buf_wrap(dst + n, sink->buf_start, sink->buf_end);
129+
/* calculate the remaining samples*/
130+
nmax = (int32_t *)source->buf_end - src;
131+
n = AE_MIN32(left_samples, nmax);
132+
nmax = (int32_t *)sink->buf_end - dst;
133+
n = AE_MIN32(n, nmax);
134+
in = (ae_int32x4 *)src;
135+
out = (ae_int32x4 *)dst;
136+
inu = AE_LA128_PP(in);
137+
outu1 = AE_LA128_PP(out);
138+
m = n >> 2;
139+
left = n & 3;
140+
/* process 2 samples per time */
141+
for (i = 0; i < m; i++) {
142+
AE_LA32X2X2_IP(in_sample, in_sample1, inu, in);
143+
AE_LA32X2X2_IP(out_sample, out_sample1, outu1, out);
144+
out--;
145+
out_sample = AE_ADD24S(in_sample, out_sample);
146+
out_sample1 = AE_ADD24S(in_sample1, out_sample1);
147+
AE_SA32X2X2_IP(out_sample, out_sample1, outu2, out);
148+
}
149+
AE_SA128POS_FP(outu2, out);
150+
151+
/* process the left sample to avoid memory access overrun */
152+
if (left) {
153+
AE_L32_IP(in_sample, (ae_int32 *)in, sizeof(ae_int32));
154+
AE_L32_IP(out_sample, (ae_int32 *)out, 0);
155+
out_sample = AE_ADD24S(in_sample, out_sample);
156+
AE_S32_L_IP(out_sample, (ae_int32 *)out, sizeof(ae_int32));
157+
}
158+
}
159+
160+
for (left_samples = samples_to_copy; left_samples > 0; left_samples -= n) {
161+
src = cir_buf_wrap(src + n, source->buf_start, source->buf_end);
162+
dst = cir_buf_wrap(dst + n, sink->buf_start, sink->buf_end);
163+
nmax = (int32_t *)source->buf_end - src;
164+
n = AE_MIN32(left_samples, nmax);
165+
nmax = (int32_t *)sink->buf_end - dst;
166+
n = AE_MIN32(n, nmax);
167+
in = (ae_int32x4 *)src;
168+
out = (ae_int32x4 *)dst;
169+
inu = AE_LA128_PP(in);
170+
m = n >> 2;
171+
left = n & 3;
172+
for (i = 0; i < m; i++) {
173+
AE_LA32X2X2_IP(in_sample, in_sample1, inu, in);
174+
AE_SA32X2X2_IP(in_sample, in_sample1, outu2, out);
175+
}
176+
AE_SA128POS_FP(outu2, out);
177+
/* process the left sample to avoid memory access overrun */
178+
if (left) {
179+
AE_L32_IP(in_sample, (ae_int32 *)in, sizeof(ae_int32));
180+
AE_S32_L_IP(in_sample, (ae_int32 *)out, sizeof(ae_int32));
181+
}
182+
}
183+
}
184+
185+
#endif /* CONFIG_FORMAT_S24LE */
186+
187+
#if CONFIG_FORMAT_S32LE
188+
static void mix_s32(struct cir_buf_ptr *sink, int32_t start_sample, int32_t mixed_samples,
189+
const struct cir_buf_ptr *source,
190+
int32_t sample_count, uint16_t gain)
191+
{
192+
int samples_to_mix, samples_to_copy, left_samples;
193+
int n, nmax, i, m, left;
194+
ae_int32x2 in_sample, in_sample1;
195+
ae_int32x2 out_sample, out_sample1;
196+
ae_int32x4 *in;
197+
ae_int32x4 *out;
198+
ae_valignx2 inu = AE_ZALIGN128();
199+
ae_valignx2 outu1 = AE_ZALIGN128();
200+
ae_valignx2 outu2 = AE_ZALIGN128();
201+
/* cir_buf_wrap() is required and is done below in a loop */
202+
int32_t *dst = (int32_t *)sink->ptr + start_sample;
203+
int32_t *src = source->ptr;
204+
205+
assert(mixed_samples >= start_sample);
206+
samples_to_mix = AE_MIN32(mixed_samples - start_sample, sample_count);
207+
samples_to_copy = sample_count - samples_to_mix;
208+
n = 0;
209+
210+
for (left_samples = samples_to_mix; left_samples > 0; left_samples -= n) {
211+
src = cir_buf_wrap(src + n, source->buf_start, source->buf_end);
212+
dst = cir_buf_wrap(dst + n, sink->buf_start, sink->buf_end);
213+
/* calculate the remaining samples*/
214+
nmax = (int32_t *)source->buf_end - src;
215+
n = AE_MIN32(left_samples, nmax);
216+
nmax = (int32_t *)sink->buf_end - dst;
217+
n = AE_MIN32(n, nmax);
218+
in = (ae_int32x4 *)src;
219+
out = (ae_int32x4 *)dst;
220+
inu = AE_LA128_PP(in);
221+
outu1 = AE_LA128_PP(out);
222+
m = n >> 2;
223+
left = n & 3;
224+
for (i = 0; i < m; i++) {
225+
AE_LA32X2X2_IP(in_sample, in_sample1, inu, in);
226+
AE_LA32X2X2_IP(out_sample, out_sample1, outu1, out);
227+
out--;
228+
out_sample = AE_ADD32S(in_sample, out_sample);
229+
out_sample1 = AE_ADD32S(in_sample1, out_sample1);
230+
AE_SA32X2X2_IP(out_sample, out_sample1, outu2, out);
231+
}
232+
AE_SA128POS_FP(outu2, out);
233+
234+
/* process the left sample to avoid memory access overrun */
235+
if (left) {
236+
AE_L32_IP(in_sample, (ae_int32 *)in, sizeof(ae_int32));
237+
AE_L32_IP(out_sample, (ae_int32 *)out, 0);
238+
out_sample = AE_ADD32S(in_sample, out_sample);
239+
AE_S32_L_IP(out_sample, (ae_int32 *)out, sizeof(ae_int32));
240+
}
241+
}
242+
243+
for (left_samples = samples_to_copy; left_samples > 0; left_samples -= n) {
244+
src = cir_buf_wrap(src + n, source->buf_start, source->buf_end);
245+
dst = cir_buf_wrap(dst + n, sink->buf_start, sink->buf_end);
246+
/* calculate the remaining samples*/
247+
nmax = (int32_t *)source->buf_end - src;
248+
n = AE_MIN32(left_samples, nmax);
249+
nmax = (int32_t *)sink->buf_end - dst;
250+
n = AE_MIN32(n, nmax);
251+
in = (ae_int32x4 *)src;
252+
out = (ae_int32x4 *)dst;
253+
inu = AE_LA128_PP(in);
254+
m = n >> 2;
255+
left = n & 3;
256+
for (i = 0; i < m; i++) {
257+
AE_LA32X2X2_IP(in_sample, in_sample1, inu, in);
258+
AE_SA32X2X2_IP(in_sample, in_sample1, outu2, out);
259+
}
260+
AE_SA128POS_FP(outu2, out);
261+
/* process the left sample to avoid memory access overrun */
262+
if (left) {
263+
AE_L32_IP(in_sample, (ae_int32 *)in, sizeof(ae_int32));
264+
AE_S32_L_IP(in_sample, (ae_int32 *)out, sizeof(ae_int32));
265+
}
266+
}
267+
}
268+
269+
#endif /* CONFIG_FORMAT_S32LE */
270+
271+
/*
 * Table pairing each supported frame format with the mix function defined
 * in this file for it. An entry is compiled in only when the matching
 * CONFIG_FORMAT_* option is enabled.
 */
const struct mix_func_map mix_func_map[] = {
272+
#if CONFIG_FORMAT_S16LE
273+
{ SOF_IPC_FRAME_S16_LE, mix_s16 },
274+
#endif
275+
#if CONFIG_FORMAT_S24LE
276+
{ SOF_IPC_FRAME_S24_4LE, mix_s24 },
277+
#endif
278+
#if CONFIG_FORMAT_S32LE
279+
{ SOF_IPC_FRAME_S32_LE, mix_s32 }
280+
#endif
281+
};
282+
283+
/* Number of entries in mix_func_map. */
const size_t mix_count = ARRAY_SIZE(mix_func_map);
284+
285+
#endif

zephyr/CMakeLists.txt

+1
Original file line numberDiff line numberDiff line change
@@ -561,6 +561,7 @@ zephyr_library_sources_ifdef(CONFIG_COMP_MIXIN_MIXOUT
561561
${SOF_AUDIO_PATH}/mixin_mixout/mixin_mixout.c
562562
${SOF_AUDIO_PATH}/mixin_mixout/mixin_mixout_generic.c
563563
${SOF_AUDIO_PATH}/mixin_mixout/mixin_mixout_hifi3.c
564+
${SOF_AUDIO_PATH}/mixin_mixout/mixin_mixout_hifi5.c
564565
)
565566

566567
zephyr_library_sources_ifdef(CONFIG_COMP_TONE

0 commit comments

Comments
 (0)