Discussion:
[PATCH 4/6] truehd: tune VLC decoding for ARM.
(too old to reply)
Ben Avison
2014-03-19 17:24:25 UTC
Permalink
Profiling on a Raspberry Pi revealed the best performance to correspond
with VLC_BITS = 5. Results for overall audio decode and the get_vlc2 function
in particular are as follows:

Before After
Mean StdDev Mean StdDev Confidence Change
6:2 total 348.8 20.1 339.6 15.1 88.8% +2.7% (insignificant)
6:2 function 38.1 8.1 26.4 4.1 100.0% +44.5%
8:2 total 339.1 15.4 324.5 15.5 99.4% +4.5%
8:2 function 33.8 7.0 27.3 5.6 99.7% +23.6%
6:6 total 604.6 20.8 572.8 20.6 100.0% +5.6%
6:6 function 95.8 8.4 68.9 8.2 100.0% +39.1%
8:8 total 766.4 17.6 741.5 21.2 100.0% +3.4%
8:8 function 106.0 11.4 86.1 9.9 100.0% +23.1%
---
libavcodec/mlpdec.c | 13 ++++++++++---
1 files changed, 10 insertions(+), 3 deletions(-)

diff --git a/libavcodec/mlpdec.c b/libavcodec/mlpdec.c
index c0f2d6a..b9d1704 100644
--- a/libavcodec/mlpdec.c
+++ b/libavcodec/mlpdec.c
@@ -37,9 +37,16 @@
#include "mlp_parser.h"
#include "mlpdsp.h"
#include "mlp.h"
+#include "config.h"

/** number of bits used for VLC lookup - longest Huffman code is 9 */
+#if ARCH_ARM == 1
+#define VLC_BITS 5
+#define VLC_STATIC_SIZE 64
+#else
#define VLC_BITS 9
+#define VLC_STATIC_SIZE 512
+#endif

typedef struct SubStream {
/// Set if a valid restart header has been read. Otherwise the substream cannot be decoded.
@@ -190,13 +197,13 @@ static av_cold void init_static(void)
if (!huff_vlc[0].bits) {
INIT_VLC_STATIC(&huff_vlc[0], VLC_BITS, 18,
&ff_mlp_huffman_tables[0][0][1], 2, 1,
- &ff_mlp_huffman_tables[0][0][0], 2, 1, 512);
+ &ff_mlp_huffman_tables[0][0][0], 2, 1, VLC_STATIC_SIZE);
INIT_VLC_STATIC(&huff_vlc[1], VLC_BITS, 16,
&ff_mlp_huffman_tables[1][0][1], 2, 1,
- &ff_mlp_huffman_tables[1][0][0], 2, 1, 512);
+ &ff_mlp_huffman_tables[1][0][0], 2, 1, VLC_STATIC_SIZE);
INIT_VLC_STATIC(&huff_vlc[2], VLC_BITS, 15,
&ff_mlp_huffman_tables[2][0][1], 2, 1,
- &ff_mlp_huffman_tables[2][0][0], 2, 1, 512);
+ &ff_mlp_huffman_tables[2][0][0], 2, 1, VLC_STATIC_SIZE);
}

ff_mlp_init_crc();
--
1.7.5.4
Ben Avison
2014-03-19 17:24:24 UTC
Permalink
Profiling results for overall audio decode and the rematrix_channels function
in particular are as follows:

Before After
Mean StdDev Mean StdDev Confidence Change
6:2 total 370.8 17.0 348.8 20.1 99.9% +6.3%
6:2 function 46.4 8.4 45.8 6.6 18.0% +1.2% (insignificant)
8:2 total 343.2 19.0 339.1 15.4 54.7% +1.2% (insignificant)
8:2 function 38.9 3.9 40.2 6.9 52.4% -3.2% (insignificant)
6:6 total 658.4 15.7 604.6 20.8 100.0% +8.9%
6:6 function 109.0 8.7 59.5 5.4 100.0% +83.3%
8:8 total 896.2 24.5 766.4 17.6 100.0% +16.9%
8:8 function 223.4 12.8 93.8 5.0 100.0% +138.3%

The assembly version has also been tested with a fuzz tester to ensure that
any combinations of inputs not exercised by my available test streams still
generate mathematically identical results to the C version.
---
libavcodec/arm/mlpdsp_arm.S | 230 ++++++++++++++++++++++++++++++++++++++
libavcodec/arm/mlpdsp_init_arm.c | 12 ++
2 files changed, 242 insertions(+), 0 deletions(-)

diff --git a/libavcodec/arm/mlpdsp_arm.S b/libavcodec/arm/mlpdsp_arm.S
index a94f45e..10008fe 100644
--- a/libavcodec/arm/mlpdsp_arm.S
+++ b/libavcodec/arm/mlpdsp_arm.S
@@ -429,3 +429,233 @@ endfunc
.unreq I
.unreq PSAMP

+/********************************************************************/
+
+PSA .req a1 // samples
+PCO .req a2 // coeffs
+PBL .req a3 // bypassed_lsbs
+INDEX .req a4
+CO0 .req v1
+CO1 .req v2
+CO2 .req v3
+CO3 .req v4
+SA0 .req v5
+SA1 .req v6
+SA2 .req sl
+SA3 .req fp
+AC0 .req ip
+AC1 .req lr
+NOISE .req SA0
+LSB .req SA1
+DCH .req SA2 // dest_ch
+MASK .req SA3
+
+ // INDEX is used as follows:
+ // bits 0..6 index2 (values up to 17, but wider so that we can
+ // add to index field without needing to mask)
+ // bits 7..14 i (values up to 160)
+ // bit 15 underflow detect for i
+ // bits 25..31 (if access_unit_size_pow2 == 128) \ index
+ // bits 26..31 (if access_unit_size_pow2 == 64) /
+
+.macro implement_rematrix shift, index_mask, mask_minus1, maxchan
+ .if \maxchan == 1
+ // We can just leave the coefficients in registers in this case
+ ldrd CO0, CO1, [PCO]
+ .endif
+1:
+ .if \maxchan == 1
+ ldrd SA0, SA1, [PSA]
+ smull AC0, AC1, CO0, SA0
+ .elseif \maxchan == 5
+ ldr CO0, [PCO, #0]
+ ldr SA0, [PSA, #0]
+ ldr CO1, [PCO, #4]
+ ldr SA1, [PSA, #4]
+ ldrd CO2, CO3, [PCO, #8]
+ smull AC0, AC1, CO0, SA0
+ ldrd SA2, SA3, [PSA, #8]
+ smlal AC0, AC1, CO1, SA1
+ ldrd CO0, CO1, [PCO, #16]
+ smlal AC0, AC1, CO2, SA2
+ ldrd SA0, SA1, [PSA, #16]
+ smlal AC0, AC1, CO3, SA3
+ smlal AC0, AC1, CO0, SA0
+ .else // \maxchan == 7
+ ldr CO2, [PCO, #0]
+ ldr SA2, [PSA, #0]
+ ldr CO3, [PCO, #4]
+ ldr SA3, [PSA, #4]
+ ldrd CO0, CO1, [PCO, #8]
+ smull AC0, AC1, CO2, SA2
+ ldrd SA0, SA1, [PSA, #8]
+ smlal AC0, AC1, CO3, SA3
+ ldrd CO2, CO3, [PCO, #16]
+ smlal AC0, AC1, CO0, SA0
+ ldrd SA2, SA3, [PSA, #16]
+ smlal AC0, AC1, CO1, SA1
+ ldrd CO0, CO1, [PCO, #24]
+ smlal AC0, AC1, CO2, SA2
+ ldrd SA0, SA1, [PSA, #24]
+ smlal AC0, AC1, CO3, SA3
+ smlal AC0, AC1, CO0, SA0
+ .endif
+ ldm sp, {NOISE, DCH, MASK}
+ smlal AC0, AC1, CO1, SA1
+ .if \shift != 0
+ .if \index_mask == 63
+ add NOISE, NOISE, INDEX, lsr #32-6
+ ldrb LSB, [PBL], #MAX_CHANNELS
+ ldrsb NOISE, [NOISE]
+ add INDEX, INDEX, INDEX, lsl #32-6
+ .else // \index_mask == 127
+ add NOISE, NOISE, INDEX, lsr #32-7
+ ldrb LSB, [PBL], #MAX_CHANNELS
+ ldrsb NOISE, [NOISE]
+ add INDEX, INDEX, INDEX, lsl #32-7
+ .endif
+ sub INDEX, INDEX, #1<<7
+ adds AC0, AC0, NOISE, lsl #\shift + 7
+ adc AC1, AC1, NOISE, asr #31
+ .else
+ ldrb LSB, [PBL], #MAX_CHANNELS
+ sub INDEX, INDEX, #1<<7
+ .endif
+ add PSA, PSA, #MAX_CHANNELS*4
+ mov AC0, AC0, lsr #14
+ orr AC0, AC0, AC1, lsl #18
+ .if !\mask_minus1
+ and AC0, AC0, MASK
+ .endif
+ add AC0, AC0, LSB
+ tst INDEX, #1<<15
+ str AC0, [PSA, DCH, lsl #2] // DCH is precompensated for the early increment of PSA
+ beq 1b
+ b 98f
+.endm
+
+.macro switch_on_maxchan shift, index_mask, mask_minus1
+ cmp v4, #5
+ blo 51f
+ beq 50f
+ implement_rematrix \shift, \index_mask, \mask_minus1, 7
+50: implement_rematrix \shift, \index_mask, \mask_minus1, 5
+51: implement_rematrix \shift, \index_mask, \mask_minus1, 1
+.endm
+
+.macro switch_on_mask shift, index_mask
+ cmp sl, #-1
+ bne 40f
+ switch_on_maxchan \shift, \index_mask, 1
+40: switch_on_maxchan \shift, \index_mask, 0
+.endm
+
+.macro switch_on_au_size shift
+ .if \shift == 0
+ switch_on_mask \shift, undefined
+ .else
+ teq v6, #64
+ bne 30f
+ orr INDEX, INDEX, v1, lsl #32-6
+ switch_on_mask \shift, 63
+30: orr INDEX, INDEX, v1, lsl #32-7
+ switch_on_mask \shift, 127
+ .endif
+.endm
+
+/* void ff_mlp_rematrix_channel_arm(int32_t *samples,
+ * const int32_t *coeffs,
+ * const uint8_t *bypassed_lsbs,
+ * const int8_t *noise_buffer,
+ * int index,
+ * unsigned int dest_ch,
+ * uint16_t blockpos,
+ * unsigned int maxchan,
+ * int matrix_noise_shift,
+ * int access_unit_size_pow2,
+ * int32_t mask);
+ */
+function ff_mlp_rematrix_channel_arm, export=1
+ push {v1-fp,lr}
+ add v1, sp, #9*4 // point at arguments on stack
+ ldm v1, {v1-sl}
+ teq v4, #1
+ teqne v4, #5
+ teqne v4, #7
+ bne 99f
+ teq v6, #64
+ teqne v6, #128
+ bne 99f
+ sub v2, v2, #MAX_CHANNELS
+ push {a4,v2,sl} // initialise NOISE,DCH,MASK; make sp dword-aligned
+ movs INDEX, v3, lsl #7
+ beq 98f // just in case, do nothing if blockpos = 0
+ subs INDEX, INDEX, #1<<7 // offset by 1 so we borrow at the right time
+ adc lr, v1, v1 // calculate index2 (C was set by preceding subs)
+ orr INDEX, INDEX, lr
+ // Switch on matrix_noise_shift: values 0 and 1 are
+ // disproportionately common so do those in a form the branch
+ // predictor can accelerate. Values can only go up to 15.
+ cmp v5, #1
+ beq 11f
+ blo 10f
+ ldr pc, [pc, v5, lsl #2]
+ .word 0
+ .word 0
+ .word 0
+ .word 12f
+ .word 13f
+ .word 14f
+ .word 15f
+ .word 16f
+ .word 17f
+ .word 18f
+ .word 19f
+ .word 20f
+ .word 21f
+ .word 22f
+ .word 23f
+ .word 24f
+ .word 25f
+10: switch_on_au_size 0
+11: switch_on_au_size 1
+12: switch_on_au_size 2
+13: switch_on_au_size 3
+14: switch_on_au_size 4
+15: switch_on_au_size 5
+16: switch_on_au_size 6
+17: switch_on_au_size 7
+18: switch_on_au_size 8
+19: switch_on_au_size 9
+20: switch_on_au_size 10
+21: switch_on_au_size 11
+22: switch_on_au_size 12
+23: switch_on_au_size 13
+24: switch_on_au_size 14
+25: switch_on_au_size 15
+
+98: add sp, sp, #3*4
+ pop {v1-fp,pc}
+99: // Can't handle these parameters, drop back to C
+ pop {v1-fp,lr}
+ b X(ff_mlp_rematrix_channel)
+endfunc
+
+ .unreq PSA
+ .unreq PCO
+ .unreq PBL
+ .unreq INDEX
+ .unreq CO0
+ .unreq CO1
+ .unreq CO2
+ .unreq CO3
+ .unreq SA0
+ .unreq SA1
+ .unreq SA2
+ .unreq SA3
+ .unreq AC0
+ .unreq AC1
+ .unreq NOISE
+ .unreq LSB
+ .unreq DCH
+ .unreq MASK
diff --git a/libavcodec/arm/mlpdsp_init_arm.c b/libavcodec/arm/mlpdsp_init_arm.c
index f0ea285..268dfdd 100644
--- a/libavcodec/arm/mlpdsp_init_arm.c
+++ b/libavcodec/arm/mlpdsp_init_arm.c
@@ -29,8 +29,20 @@ void ff_mlp_filter_channel_arm(int32_t *state, const int32_t *coeff,
int firorder, int iirorder,
unsigned int filter_shift, int32_t mask,
int blocksize, int32_t *sample_buffer);
+void ff_mlp_rematrix_channel_arm(int32_t *samples,
+ const int32_t *coeffs,
+ const uint8_t *bypassed_lsbs,
+ const int8_t *noise_buffer,
+ int index,
+ unsigned int dest_ch,
+ uint16_t blockpos,
+ unsigned int maxchan,
+ int matrix_noise_shift,
+ int access_unit_size_pow2,
+ int32_t mask);

av_cold void ff_mlpdsp_init_arm(MLPDSPContext *c)
{
c->mlp_filter_channel = ff_mlp_filter_channel_arm;
+ c->mlp_rematrix_channel = ff_mlp_rematrix_channel_arm;
}
--
1.7.5.4
Ben Avison
2014-03-19 17:24:23 UTC
Permalink
Verified with profiling that this doesn't have a measurable effect upon
overall performance.
---
libavcodec/mlpdec.c | 37 ++++++++++++-------------------------
libavcodec/mlpdsp.c | 33 +++++++++++++++++++++++++++++++++
libavcodec/mlpdsp.h | 23 +++++++++++++++++++++++
3 files changed, 68 insertions(+), 25 deletions(-)

diff --git a/libavcodec/mlpdec.c b/libavcodec/mlpdec.c
index ed5a6ac..c0f2d6a 100644
--- a/libavcodec/mlpdec.c
+++ b/libavcodec/mlpdec.c
@@ -978,7 +978,7 @@ static void fill_noise_buffer(MLPDecodeContext *m, unsigned int substr)
static void rematrix_channels(MLPDecodeContext *m, unsigned int substr)
{
SubStream *s = &m->substream[substr];
- unsigned int mat, src_ch, i;
+ unsigned int mat;
unsigned int maxchan;

maxchan = s->max_matrix_channel;
@@ -990,31 +990,18 @@ static void rematrix_channels(MLPDecodeContext *m, unsigned int substr)
}

for (mat = 0; mat < s->num_primitive_matrices; mat++) {
- int matrix_noise_shift = s->matrix_noise_shift[mat];
unsigned int dest_ch = s->matrix_out_ch[mat];
- int32_t mask = MSB_MASK(s->quant_step_size[dest_ch]);
- int32_t *coeffs = s->matrix_coeff[mat];
- int index = s->num_primitive_matrices - mat;
- int index2 = 2 * index + 1;
-
- /* TODO: DSPContext? */
-
- for (i = 0; i < s->blockpos; i++) {
- int32_t bypassed_lsb = m->bypassed_lsbs[i][mat];
- int32_t *samples = m->sample_buffer[i];
- int64_t accum = 0;
-
- for (src_ch = 0; src_ch <= maxchan; src_ch++)
- accum += (int64_t) samples[src_ch] * coeffs[src_ch];
-
- if (matrix_noise_shift) {
- index &= m->access_unit_size_pow2 - 1;
- accum += m->noise_buffer[index] << (matrix_noise_shift + 7);
- index += index2;
- }
-
- samples[dest_ch] = ((accum >> 14) & mask) + bypassed_lsb;
- }
+ m->dsp.mlp_rematrix_channel(&m->sample_buffer[0][0],
+ s->matrix_coeff[mat],
+ &m->bypassed_lsbs[0][mat],
+ m->noise_buffer,
+ s->num_primitive_matrices - mat,
+ dest_ch,
+ s->blockpos,
+ maxchan,
+ s->matrix_noise_shift[mat],
+ m->access_unit_size_pow2,
+ MSB_MASK(s->quant_step_size[dest_ch]));
}
}

diff --git a/libavcodec/mlpdsp.c b/libavcodec/mlpdsp.c
index 151cf83..dfa13af 100644
--- a/libavcodec/mlpdsp.c
+++ b/libavcodec/mlpdsp.c
@@ -57,9 +57,42 @@ static void mlp_filter_channel(int32_t *state, const int32_t *coeff,
}
}

+void ff_mlp_rematrix_channel(int32_t *samples,
+ const int32_t *coeffs,
+ const uint8_t *bypassed_lsbs,
+ const int8_t *noise_buffer,
+ int index,
+ unsigned int dest_ch,
+ uint16_t blockpos,
+ unsigned int maxchan,
+ int matrix_noise_shift,
+ int access_unit_size_pow2,
+ int32_t mask)
+{
+ unsigned int src_ch, i;
+ int index2 = 2 * index + 1;
+ for (i = 0; i < blockpos; i++) {
+ int64_t accum = 0;
+
+ for (src_ch = 0; src_ch <= maxchan; src_ch++)
+ accum += (int64_t) samples[src_ch] * coeffs[src_ch];
+
+ if (matrix_noise_shift) {
+ index &= access_unit_size_pow2 - 1;
+ accum += noise_buffer[index] << (matrix_noise_shift + 7);
+ index += index2;
+ }
+
+ samples[dest_ch] = ((accum >> 14) & mask) + *bypassed_lsbs;
+ bypassed_lsbs += MAX_CHANNELS;
+ samples += MAX_CHANNELS;
+ }
+}
+
av_cold void ff_mlpdsp_init(MLPDSPContext *c)
{
c->mlp_filter_channel = mlp_filter_channel;
+ c->mlp_rematrix_channel = ff_mlp_rematrix_channel;
if (ARCH_ARM)
ff_mlpdsp_init_arm(c);
if (ARCH_X86)
diff --git a/libavcodec/mlpdsp.h b/libavcodec/mlpdsp.h
index c985a17..bd864d9 100644
--- a/libavcodec/mlpdsp.h
+++ b/libavcodec/mlpdsp.h
@@ -24,11 +24,34 @@

#include <stdint.h>

+void ff_mlp_rematrix_channel(int32_t *samples,
+ const int32_t *coeffs,
+ const uint8_t *bypassed_lsbs,
+ const int8_t *noise_buffer,
+ int index,
+ unsigned int dest_ch,
+ uint16_t blockpos,
+ unsigned int maxchan,
+ int matrix_noise_shift,
+ int access_unit_size_pow2,
+ int32_t mask);
+
typedef struct MLPDSPContext {
void (*mlp_filter_channel)(int32_t *state, const int32_t *coeff,
int firorder, int iirorder,
unsigned int filter_shift, int32_t mask,
int blocksize, int32_t *sample_buffer);
+ void (*mlp_rematrix_channel)(int32_t *samples,
+ const int32_t *coeffs,
+ const uint8_t *bypassed_lsbs,
+ const int8_t *noise_buffer,
+ int index,
+ unsigned int dest_ch,
+ uint16_t blockpos,
+ unsigned int maxchan,
+ int matrix_noise_shift,
+ int access_unit_size_pow2,
+ int32_t mask);
} MLPDSPContext;

void ff_mlpdsp_init(MLPDSPContext *c);
--
1.7.5.4
Diego Biurrun
2014-03-19 17:38:41 UTC
Permalink
Post by Ben Avison
--- a/libavcodec/mlpdec.c
+++ b/libavcodec/mlpdec.c
--- a/libavcodec/mlpdsp.c
+++ b/libavcodec/mlpdsp.c
@@ -57,9 +57,42 @@ static void mlp_filter_channel(int32_t *state, const int32_t *coeff,
+void ff_mlp_rematrix_channel(int32_t *samples,
This is not used outside of the file, so it should be a static function
and not have a ff_ prefix.

Diego
Luca Barbato
2014-03-19 17:49:52 UTC
Permalink
Post by Ben Avison
Verified with profiling that this doesn't have a measurable effect upon
overall performance.
---
libavcodec/mlpdec.c | 37 ++++++++++++-------------------------
libavcodec/mlpdsp.c | 33 +++++++++++++++++++++++++++++++++
libavcodec/mlpdsp.h | 23 +++++++++++++++++++++++
3 files changed, 68 insertions(+), 25 deletions(-)
diff --git a/libavcodec/mlpdec.c b/libavcodec/mlpdec.c
index ed5a6ac..c0f2d6a 100644
--- a/libavcodec/mlpdec.c
+++ b/libavcodec/mlpdec.c
@@ -978,7 +978,7 @@ static void fill_noise_buffer(MLPDecodeContext *m, unsigned int substr)
static void rematrix_channels(MLPDecodeContext *m, unsigned int substr)
{
SubStream *s = &m->substream[substr];
- unsigned int mat, src_ch, i;
+ unsigned int mat;
unsigned int maxchan;
maxchan = s->max_matrix_channel;
@@ -990,31 +990,18 @@ static void rematrix_channels(MLPDecodeContext *m, unsigned int substr)
}
for (mat = 0; mat < s->num_primitive_matrices; mat++) {
- int matrix_noise_shift = s->matrix_noise_shift[mat];
unsigned int dest_ch = s->matrix_out_ch[mat];
- int32_t mask = MSB_MASK(s->quant_step_size[dest_ch]);
- int32_t *coeffs = s->matrix_coeff[mat];
- int index = s->num_primitive_matrices - mat;
- int index2 = 2 * index + 1;
-
- /* TODO: DSPContext? */
-
- for (i = 0; i < s->blockpos; i++) {
- int32_t bypassed_lsb = m->bypassed_lsbs[i][mat];
- int32_t *samples = m->sample_buffer[i];
- int64_t accum = 0;
-
- for (src_ch = 0; src_ch <= maxchan; src_ch++)
- accum += (int64_t) samples[src_ch] * coeffs[src_ch];
-
- if (matrix_noise_shift) {
- index &= m->access_unit_size_pow2 - 1;
- accum += m->noise_buffer[index] << (matrix_noise_shift + 7);
- index += index2;
- }
-
- samples[dest_ch] = ((accum >> 14) & mask) + bypassed_lsb;
- }
+ m->dsp.mlp_rematrix_channel(&m->sample_buffer[0][0],
+ s->matrix_coeff[mat],
+ &m->bypassed_lsbs[0][mat],
+ m->noise_buffer,
+ s->num_primitive_matrices - mat,
+ dest_ch,
+ s->blockpos,
+ maxchan,
+ s->matrix_noise_shift[mat],
+ m->access_unit_size_pow2,
+ MSB_MASK(s->quant_step_size[dest_ch]));
}
}
diff --git a/libavcodec/mlpdsp.c b/libavcodec/mlpdsp.c
index 151cf83..dfa13af 100644
--- a/libavcodec/mlpdsp.c
+++ b/libavcodec/mlpdsp.c
@@ -57,9 +57,42 @@ static void mlp_filter_channel(int32_t *state, const int32_t *coeff,
}
}
+void ff_mlp_rematrix_channel(int32_t *samples,
+ const int32_t *coeffs,
+ const uint8_t *bypassed_lsbs,
+ const int8_t *noise_buffer,
+ int index,
+ unsigned int dest_ch,
+ uint16_t blockpos,
+ unsigned int maxchan,
+ int matrix_noise_shift,
+ int access_unit_size_pow2,
+ int32_t mask)
+{
+ unsigned int src_ch, i;
+ int index2 = 2 * index + 1;
+ for (i = 0; i < blockpos; i++) {
+ int64_t accum = 0;
+
+ for (src_ch = 0; src_ch <= maxchan; src_ch++)
+ accum += (int64_t) samples[src_ch] * coeffs[src_ch];
+
+ if (matrix_noise_shift) {
+ index &= access_unit_size_pow2 - 1;
+ accum += noise_buffer[index] << (matrix_noise_shift + 7);
+ index += index2;
+ }
+
+ samples[dest_ch] = ((accum >> 14) & mask) + *bypassed_lsbs;
+ bypassed_lsbs += MAX_CHANNELS;
+ samples += MAX_CHANNELS;
+ }
+}
+
av_cold void ff_mlpdsp_init(MLPDSPContext *c)
{
c->mlp_filter_channel = mlp_filter_channel;
+ c->mlp_rematrix_channel = ff_mlp_rematrix_channel;
if (ARCH_ARM)
ff_mlpdsp_init_arm(c);
if (ARCH_X86)
diff --git a/libavcodec/mlpdsp.h b/libavcodec/mlpdsp.h
index c985a17..bd864d9 100644
--- a/libavcodec/mlpdsp.h
+++ b/libavcodec/mlpdsp.h
@@ -24,11 +24,34 @@
#include <stdint.h>
+void ff_mlp_rematrix_channel(int32_t *samples,
+ const int32_t *coeffs,
+ const uint8_t *bypassed_lsbs,
+ const int8_t *noise_buffer,
+ int index,
+ unsigned int dest_ch,
+ uint16_t blockpos,
+ unsigned int maxchan,
+ int matrix_noise_shift,
+ int access_unit_size_pow2,
+ int32_t mask);
+
typedef struct MLPDSPContext {
void (*mlp_filter_channel)(int32_t *state, const int32_t *coeff,
int firorder, int iirorder,
unsigned int filter_shift, int32_t mask,
int blocksize, int32_t *sample_buffer);
+ void (*mlp_rematrix_channel)(int32_t *samples,
+ const int32_t *coeffs,
+ const uint8_t *bypassed_lsbs,
+ const int8_t *noise_buffer,
+ int index,
+ unsigned int dest_ch,
+ uint16_t blockpos,
+ unsigned int maxchan,
+ int matrix_noise_shift,
+ int access_unit_size_pow2,
+ int32_t mask);
Could you please try to have the function accept the m and s context
instead of having all those parameters? Does it work equally well?
Post by Ben Avison
} MLPDSPContext;
void ff_mlpdsp_init(MLPDSPContext *c);
Martin Storsjö
2014-03-19 17:56:48 UTC
Permalink
Post by Luca Barbato
Post by Ben Avison
Verified with profiling that this doesn't have a measurable effect upon
overall performance.
---
libavcodec/mlpdec.c | 37 ++++++++++++-------------------------
libavcodec/mlpdsp.c | 33 +++++++++++++++++++++++++++++++++
libavcodec/mlpdsp.h | 23 +++++++++++++++++++++++
3 files changed, 68 insertions(+), 25 deletions(-)
diff --git a/libavcodec/mlpdsp.h b/libavcodec/mlpdsp.h
index c985a17..bd864d9 100644
--- a/libavcodec/mlpdsp.h
+++ b/libavcodec/mlpdsp.h
@@ -24,11 +24,34 @@
#include <stdint.h>
+void ff_mlp_rematrix_channel(int32_t *samples,
+ const int32_t *coeffs,
+ const uint8_t *bypassed_lsbs,
+ const int8_t *noise_buffer,
+ int index,
+ unsigned int dest_ch,
+ uint16_t blockpos,
+ unsigned int maxchan,
+ int matrix_noise_shift,
+ int access_unit_size_pow2,
+ int32_t mask);
+
typedef struct MLPDSPContext {
void (*mlp_filter_channel)(int32_t *state, const int32_t *coeff,
int firorder, int iirorder,
unsigned int filter_shift, int32_t mask,
int blocksize, int32_t *sample_buffer);
+ void (*mlp_rematrix_channel)(int32_t *samples,
+ const int32_t *coeffs,
+ const uint8_t *bypassed_lsbs,
+ const int8_t *noise_buffer,
+ int index,
+ unsigned int dest_ch,
+ uint16_t blockpos,
+ unsigned int maxchan,
+ int matrix_noise_shift,
+ int access_unit_size_pow2,
+ int32_t mask);
Could you please try to have the function accept the m and s context
instead of having all those parameters? Does it work equally well?
That requires hand-coding (and manually updating) the struct element
offsets (see libavcodec/arm/asm-offsets.h and mpegvideo_arm.c) and IIRC we
generally tend to move away from it.

// Martin
Luca Barbato
2014-03-19 19:10:51 UTC
Permalink
Post by Martin Storsjö
Post by Luca Barbato
Post by Ben Avison
Verified with profiling that this doesn't have a measurable effect upon
overall performance.
---
libavcodec/mlpdec.c | 37 ++++++++++++-------------------------
libavcodec/mlpdsp.c | 33 +++++++++++++++++++++++++++++++++
libavcodec/mlpdsp.h | 23 +++++++++++++++++++++++
3 files changed, 68 insertions(+), 25 deletions(-)
diff --git a/libavcodec/mlpdsp.h b/libavcodec/mlpdsp.h
index c985a17..bd864d9 100644
--- a/libavcodec/mlpdsp.h
+++ b/libavcodec/mlpdsp.h
@@ -24,11 +24,34 @@
#include <stdint.h>
+void ff_mlp_rematrix_channel(int32_t *samples,
+ const int32_t *coeffs,
+ const uint8_t *bypassed_lsbs,
+ const int8_t *noise_buffer,
+ int index,
+ unsigned int dest_ch,
+ uint16_t blockpos,
+ unsigned int maxchan,
+ int matrix_noise_shift,
+ int access_unit_size_pow2,
+ int32_t mask);
+
typedef struct MLPDSPContext {
void (*mlp_filter_channel)(int32_t *state, const int32_t *coeff,
int firorder, int iirorder,
unsigned int filter_shift, int32_t mask,
int blocksize, int32_t *sample_buffer);
+ void (*mlp_rematrix_channel)(int32_t *samples,
+ const int32_t *coeffs,
+ const uint8_t *bypassed_lsbs,
+ const int8_t *noise_buffer,
+ int index,
+ unsigned int dest_ch,
+ uint16_t blockpos,
+ unsigned int maxchan,
+ int matrix_noise_shift,
+ int access_unit_size_pow2,
+ int32_t mask);
Could you please try to have the function accept the m and s context
instead of having all those parameters? Does it work equally well?
That requires hand-coding (and manually updating) the struct element
offsets (see libavcodec/arm/asm-offsets.h and mpegvideo_arm.c) and IIRC
we generally tend to move away from it.
Fine as it is then.

lu
Ben Avison
2014-03-19 17:24:22 UTC
Permalink
Profiling results for overall audio decode and the mlp_filter_channel(_arm)
function in particular are as follows:

Before After
Mean StdDev Mean StdDev Confidence Change
6:2 total 380.4 22.0 370.8 17.0 87.4% +2.6% (insignificant)
6:2 function 60.7 7.2 36.6 8.1 100.0% +65.8%
8:2 total 357.0 17.5 343.2 19.0 97.8% +4.0% (insignificant)
8:2 function 60.3 8.8 37.3 3.8 100.0% +61.8%
6:6 total 717.2 23.2 658.4 15.7 100.0% +8.9%
6:6 function 140.4 12.9 81.5 9.2 100.0% +72.4%
8:8 total 981.9 16.2 896.2 24.5 100.0% +9.6%
8:8 function 193.4 15.0 103.3 11.5 100.0% +87.2%

Experiments with adding preload instructions to this function yielded no
useful benefit, so these have not been included.

The assembly version has also been tested with a fuzz tester to ensure that
any combinations of inputs not exercised by my available test streams still
generate mathematically identical results to the C version.
---
libavcodec/arm/Makefile | 4 +
libavcodec/arm/mlpdsp_arm.S | 431 ++++++++++++++++++++++++++++++++++++++
libavcodec/arm/mlpdsp_init_arm.c | 36 ++++
libavcodec/mlpdsp.c | 2 +
libavcodec/mlpdsp.h | 1 +
5 files changed, 474 insertions(+), 0 deletions(-)
create mode 100644 libavcodec/arm/mlpdsp_arm.S
create mode 100644 libavcodec/arm/mlpdsp_init_arm.c

diff --git a/libavcodec/arm/Makefile b/libavcodec/arm/Makefile
index 8bdccbd..e6eb0b6 100644
--- a/libavcodec/arm/Makefile
+++ b/libavcodec/arm/Makefile
@@ -21,9 +21,13 @@ OBJS-$(CONFIG_H264PRED) += arm/h264pred_init_arm.o
OBJS-$(CONFIG_H264QPEL) += arm/h264qpel_init_arm.o
OBJS-$(CONFIG_HPELDSP) += arm/hpeldsp_init_arm.o \
arm/hpeldsp_arm.o
+OBJS-$(CONFIG_MLP_DECODER) += arm/mlpdsp_init_arm.o \
+ arm/mlpdsp_arm.o
OBJS-$(CONFIG_MPEGAUDIODSP) += arm/mpegaudiodsp_init_arm.o
OBJS-$(CONFIG_MPEGVIDEO) += arm/mpegvideo_arm.o
OBJS-$(CONFIG_NEON_CLOBBER_TEST) += arm/neontest.o
+OBJS-$(CONFIG_TRUEHD_DECODER) += arm/mlpdsp_init_arm.o \
+ arm/mlpdsp_arm.o
OBJS-$(CONFIG_VC1_DECODER) += arm/vc1dsp_init_arm.o
OBJS-$(CONFIG_VORBIS_DECODER) += arm/vorbisdsp_init_arm.o
OBJS-$(CONFIG_VP3DSP) += arm/vp3dsp_init_arm.o
diff --git a/libavcodec/arm/mlpdsp_arm.S b/libavcodec/arm/mlpdsp_arm.S
new file mode 100644
index 0000000..a94f45e
--- /dev/null
+++ b/libavcodec/arm/mlpdsp_arm.S
@@ -0,0 +1,431 @@
+/*
+ * Copyright (c) 2014 RISC OS Open Ltd
+ * Author: Ben Avison <***@riscosopen.org>
+ *
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+// This code uses too many ARM-only tricks to easily assemble as Thumb
+.arm
+#undef CONFIG_THUMB
+#define CONFIG_THUMB 0
+
+#include "libavutil/arm/asm.S"
+
+#define MAX_CHANNELS 8
+#define MAX_FIR_ORDER 8
+#define MAX_IIR_ORDER 4
+#define MAX_RATEFACTOR 4
+#define MAX_BLOCKSIZE (40 * MAX_RATEFACTOR)
+
+PST .req a1
+PCO .req a2
+AC0 .req a3
+AC1 .req a4
+CO0 .req v1
+CO1 .req v2
+CO2 .req v3
+CO3 .req v4
+ST0 .req v5
+ST1 .req v6
+ST2 .req sl
+ST3 .req fp
+I .req ip
+PSAMP .req lr
+
+
+// Some macros that do loads/multiplies where the register number is determined
+// from an assembly-time expression. Boy is GNU assembler's syntax ugly...
+
+.macro load group, index, base, offset
+ .altmacro
+ load_ \group, %(\index), \base, \offset
+ .noaltmacro
+.endm
+
+.macro load_ group, index, base, offset
+ ldr \group\index, [\base, #\offset]
+.endm
+
+.macro loadd group, index, base, offset
+ .altmacro
+ loadd_ \group, %(\index), %(\index+1), \base, \offset
+ .noaltmacro
+.endm
+
+.macro loadd_ group, index0, index1, base, offset
+A .if offset >= 256
+A ldr \group\index0, [\base, #\offset]
+A ldr \group\index1, [\base, #(\offset) + 4]
+A .else
+ ldrd \group\index0, \group\index1, [\base, #\offset]
+A .endif
+.endm
+
+.macro multiply index, accumulate, long
+ .altmacro
+ multiply_ %(\index), \accumulate, \long
+ .noaltmacro
+.endm
+
+.macro multiply_ index, accumulate, long
+ .if \long
+ .if \accumulate
+ smlal AC0, AC1, CO\index, ST\index
+ .else
+ smull AC0, AC1, CO\index, ST\index
+ .endif
+ .else
+ .if \accumulate
+ mla AC0, CO\index, ST\index, AC0
+ .else
+ mul AC0, CO\index, ST\index
+ .endif
+ .endif
+.endm
+
+// A macro to update the load register number and load offsets
+
+.macro inc howmany
+ .set LOAD_REG, (LOAD_REG + \howmany) & 3
+ .set OFFSET_CO, OFFSET_CO + 4 * \howmany
+ .set OFFSET_ST, OFFSET_ST + 4 * \howmany
+ .if FIR_REMAIN > 0
+ .set FIR_REMAIN, FIR_REMAIN - \howmany
+ .if FIR_REMAIN == 0
+ .set OFFSET_CO, 4 * MAX_FIR_ORDER
+ .set OFFSET_ST, 4 * (MAX_BLOCKSIZE + MAX_FIR_ORDER)
+ .endif
+ .elseif IIR_REMAIN > 0
+ .set IIR_REMAIN, IIR_REMAIN - \howmany
+ .endif
+.endm
+
+// Macro to implement the inner loop for one specific combination of parameters
+
+.macro implement_filter mask_minus1, shift_0, shift_8, iir_taps, fir_taps
+ .set TOTAL_TAPS, \iir_taps + \fir_taps
+
+ // Deal with register allocation...
+ .set DEFINED_SHIFT, 0
+ .set DEFINED_MASK, 0
+ .set SHUFFLE_SHIFT, 0
+ .set SHUFFLE_MASK, 0
+ .set SPILL_SHIFT, 0
+ .set SPILL_MASK, 0
+ .if TOTAL_TAPS == 0
+ // Little register pressure in this case - just keep MASK where it was
+ .if !\mask_minus1
+ MASK .req ST1
+ .set DEFINED_MASK, 1
+ .endif
+ .else
+ .if \shift_0
+ .if !\mask_minus1
+ // AC1 is unused with shift 0
+ MASK .req AC1
+ .set DEFINED_MASK, 1
+ .set SHUFFLE_MASK, 1
+ .endif
+ .elseif \shift_8
+ .if !\mask_minus1
+ .if TOTAL_TAPS <= 4
+ // All coefficients are preloaded (so pointer not needed)
+ MASK .req PCO
+ .set DEFINED_MASK, 1
+ .set SHUFFLE_MASK, 1
+ .else
+ .set SPILL_MASK, 1
+ .endif
+ .endif
+ .else // shift not 0 or 8
+ .if TOTAL_TAPS <= 3
+ // All coefficients are preloaded, and at least one CO register is unused
+ .if \fir_taps & 1
+ SHIFT .req CO0
+ .set DEFINED_SHIFT, 1
+ .set SHUFFLE_SHIFT, 1
+ .else
+ SHIFT .req CO3
+ .set DEFINED_SHIFT, 1
+ .set SHUFFLE_SHIFT, 1
+ .endif
+ .if !\mask_minus1
+ MASK .req PCO
+ .set DEFINED_MASK, 1
+ .set SHUFFLE_MASK, 1
+ .endif
+ .elseif TOTAL_TAPS == 4
+ // All coefficients are preloaded
+ SHIFT .req PCO
+ .set DEFINED_SHIFT, 1
+ .set SHUFFLE_SHIFT, 1
+ .if !\mask_minus1
+ .set SPILL_MASK, 1
+ .endif
+ .else
+ .set SPILL_SHIFT, 1
+ .if !\mask_minus1
+ .set SPILL_MASK, 1
+ .endif
+ .endif
+ .endif
+ .endif
+ .if SPILL_SHIFT
+ SHIFT .req ST0
+ .set DEFINED_SHIFT, 1
+ .endif
+ .if SPILL_MASK
+ MASK .req ST1
+ .set DEFINED_MASK, 1
+ .endif
+
+ // Preload coefficients if possible
+ .if TOTAL_TAPS <= 4
+ .set OFFSET_CO, 0
+ .if \fir_taps & 1
+ .set LOAD_REG, 1
+ .else
+ .set LOAD_REG, 0
+ .endif
+ .rept \fir_taps
+ load CO, LOAD_REG, PCO, OFFSET_CO
+ .set LOAD_REG, (LOAD_REG + 1) & 3
+ .set OFFSET_CO, OFFSET_CO + 4
+ .endr
+ .set OFFSET_CO, 4 * MAX_FIR_ORDER
+ .rept \iir_taps
+ load CO, LOAD_REG, PCO, OFFSET_CO
+ .set LOAD_REG, (LOAD_REG + 1) & 3
+ .set OFFSET_CO, OFFSET_CO + 4
+ .endr
+ .endif
+
+ // Move mask/shift to final positions if necessary
+ // Need to do this after preloading, because in some cases we
+ // reuse the coefficient pointer register
+ .if SHUFFLE_SHIFT
+ mov SHIFT, ST0
+ .endif
+ .if SHUFFLE_MASK
+ mov MASK, ST1
+ .endif
+
+ // Begin loop
+01:
+ .if TOTAL_TAPS == 0
+ // Things simplify a lot in this case
+ // In fact this could be pipelined further if it's worth it...
+ ldr ST0, [PSAMP]
+ subs I, I, #1
+ .if !\mask_minus1
+ and ST0, ST0, MASK
+ .endif
+ str ST0, [PST, #-4]!
+ str ST0, [PST, #4 * (MAX_BLOCKSIZE + MAX_FIR_ORDER)]
+ str ST0, [PSAMP], #4 * MAX_CHANNELS
+ bne 01b
+ .else
+ .if \fir_taps & 1
+ .set LOAD_REG, 1
+ .else
+ .set LOAD_REG, 0
+ .endif
+ .set LOAD_BANK, 0
+ .set FIR_REMAIN, \fir_taps
+ .set IIR_REMAIN, \iir_taps
+ .if FIR_REMAIN == 0 // only IIR terms
+ .set OFFSET_CO, 4 * MAX_FIR_ORDER
+ .set OFFSET_ST, 4 * (MAX_BLOCKSIZE + MAX_FIR_ORDER)
+ .else
+ .set OFFSET_CO, 0
+ .set OFFSET_ST, 0
+ .endif
+ .set MUL_REG, LOAD_REG
+ .set COUNTER, 0
+ .rept TOTAL_TAPS + 2
+ // Do load(s)
+ .if FIR_REMAIN != 0 || IIR_REMAIN != 0
+ .if COUNTER == 0
+ .if TOTAL_TAPS > 4
+ load CO, LOAD_REG, PCO, OFFSET_CO
+ .endif
+ load ST, LOAD_REG, PST, OFFSET_ST
+ inc 1
+ .elseif COUNTER == 1 && (\fir_taps & 1) == 0
+ .if TOTAL_TAPS > 4
+ load CO, LOAD_REG, PCO, OFFSET_CO
+ .endif
+ load ST, LOAD_REG, PST, OFFSET_ST
+ inc 1
+ .elseif LOAD_BANK == 0
+ .if TOTAL_TAPS > 4
+ .if FIR_REMAIN == 0 && IIR_REMAIN == 1
+ load CO, LOAD_REG, PCO, OFFSET_CO
+ .else
+ loadd CO, LOAD_REG, PCO, OFFSET_CO
+ .endif
+ .endif
+ .set LOAD_BANK, 1
+ .else
+ .if FIR_REMAIN == 0 && IIR_REMAIN == 1
+ load ST, LOAD_REG, PST, OFFSET_ST
+ inc 1
+ .else
+ loadd ST, LOAD_REG, PST, OFFSET_ST
+ inc 2
+ .endif
+ .set LOAD_BANK, 0
+ .endif
+ .endif
+
+ // Do interleaved multiplies, slightly delayed
+ .if COUNTER >= 2
+ multiply MUL_REG, COUNTER > 2, !\shift_0
+ .set MUL_REG, (MUL_REG + 1) & 3
+ .endif
+ .set COUNTER, COUNTER + 1
+ .endr
+
+ // Post-process the result of the multiplies
+ .if SPILL_SHIFT
+ ldr SHIFT, [sp, #9*4 + 0*4]
+ .endif
+ .if SPILL_MASK
+ ldr MASK, [sp, #9*4 + 1*4]
+ .endif
+ ldr ST2, [PSAMP]
+ subs I, I, #1
+ .if \shift_8
+ mov AC0, AC0, lsr #8
+ orr AC0, AC0, AC1, lsl #24
+ .elseif !\shift_0
+ rsb ST3, SHIFT, #32
+ mov AC0, AC0, lsr SHIFT
+ orr AC0, AC0, AC1, lsl ST3
+ .endif
+ .if \mask_minus1
+ add ST3, ST2, AC0
+ .else
+ add ST2, ST2, AC0
+ and ST3, ST2, MASK
+ sub ST2, ST3, AC0
+ .endif
+ str ST3, [PST, #-4]!
+ str ST2, [PST, #4 * (MAX_BLOCKSIZE + MAX_FIR_ORDER)]
+ str ST3, [PSAMP], #4 * MAX_CHANNELS
+ bne 01b
+ .endif
+ b 99f
+
+ .if DEFINED_SHIFT
+ .unreq SHIFT
+ .endif
+ .if DEFINED_MASK
+ .unreq MASK
+ .endif
+.endm
+
+.macro switch_on_fir_taps mask_minus1, shift_0, shift_8, iir_taps
+ ldr pc, [pc, a3, LSL #2] // firorder is in range 0-(8-iir_taps)
+ .word 0
+ .word 70f
+ .word 71f
+ .word 72f
+ .word 73f
+ .word 74f
+ .word 75f
+ .if \iir_taps <= 2
+ .word 76f
+ .if \iir_taps <= 1
+ .word 77f
+ .if \iir_taps == 0
+ .word 78f
+ .endif
+ .endif
+ .endif
+70: implement_filter \mask_minus1, \shift_0, \shift_8, \iir_taps, 0
+71: implement_filter \mask_minus1, \shift_0, \shift_8, \iir_taps, 1
+72: implement_filter \mask_minus1, \shift_0, \shift_8, \iir_taps, 2
+73: implement_filter \mask_minus1, \shift_0, \shift_8, \iir_taps, 3
+74: implement_filter \mask_minus1, \shift_0, \shift_8, \iir_taps, 4
+75: implement_filter \mask_minus1, \shift_0, \shift_8, \iir_taps, 5
+ .if \iir_taps <= 2
+76: implement_filter \mask_minus1, \shift_0, \shift_8, \iir_taps, 6
+ .if \iir_taps <= 1
+77: implement_filter \mask_minus1, \shift_0, \shift_8, \iir_taps, 7
+ .if \iir_taps == 0
+78: implement_filter \mask_minus1, \shift_0, \shift_8, \iir_taps, 8
+ .endif
+ .endif
+ .endif
+.endm
+
+.macro switch_on_iir_taps mask_minus1, shift_0, shift_8
+ ldr pc, [pc, a4, LSL #2] // irorder is in range 0-3
+ .word 0
+ .word 60f
+ .word 61f
+ .word 62f
+ .word 63f
+60: switch_on_fir_taps \mask_minus1, \shift_0, \shift_8, 0
+61: switch_on_fir_taps \mask_minus1, \shift_0, \shift_8, 1
+62: switch_on_fir_taps \mask_minus1, \shift_0, \shift_8, 2
+63: switch_on_fir_taps \mask_minus1, \shift_0, \shift_8, 3
+.endm
+
+/* void ff_mlp_filter_channel_arm(int32_t *state, const int32_t *coeff,
+ * int firorder, int iirorder,
+ * unsigned int filter_shift, int32_t mask,
+ * int blocksize, int32_t *sample_buffer);
+ */
+function ff_mlp_filter_channel_arm, export=1
+ push {v1-fp,lr}
+ add v1, sp, #9*4 // point at arguments on stack
+ ldm v1, {ST0,ST1,I,PSAMP}
+ cmp ST1, #-1
+ bne 30f
+ movs ST2, ST0, lsl #29 // shift is in range 0-15; we want to special-case 0 and 8
+ bne 20f
+ bcs 10f
+ switch_on_iir_taps 1, 1, 0
+10: switch_on_iir_taps 1, 0, 1
+20: switch_on_iir_taps 1, 0, 0
+30: movs ST2, ST0, lsl #29 // shift is in range 0-15; we want to special-case 0 and 8
+ bne 50f
+ bcs 40f
+ switch_on_iir_taps 0, 1, 0
+40: switch_on_iir_taps 0, 0, 1
+50: switch_on_iir_taps 0, 0, 0
+99: pop {v1-fp,pc}
+endfunc
+
+ .unreq PST
+ .unreq PCO
+ .unreq AC0
+ .unreq AC1
+ .unreq CO0
+ .unreq CO1
+ .unreq CO2
+ .unreq CO3
+ .unreq ST0
+ .unreq ST1
+ .unreq ST2
+ .unreq ST3
+ .unreq I
+ .unreq PSAMP
+
diff --git a/libavcodec/arm/mlpdsp_init_arm.c b/libavcodec/arm/mlpdsp_init_arm.c
new file mode 100644
index 0000000..f0ea285
--- /dev/null
+++ b/libavcodec/arm/mlpdsp_init_arm.c
@@ -0,0 +1,36 @@
+/*
+ * Copyright (c) 2014 RISC OS Open Ltd
+ * Author: Ben Avison <***@riscosopen.org>
+ *
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdint.h>
+
+#include "libavutil/arm/cpu.h"
+#include "libavutil/attributes.h"
+#include "libavcodec/mlpdsp.h"
+
+void ff_mlp_filter_channel_arm(int32_t *state, const int32_t *coeff,
+ int firorder, int iirorder,
+ unsigned int filter_shift, int32_t mask,
+ int blocksize, int32_t *sample_buffer);
+
+av_cold void ff_mlpdsp_init_arm(MLPDSPContext *c)
+{
+ c->mlp_filter_channel = ff_mlp_filter_channel_arm;
+}
diff --git a/libavcodec/mlpdsp.c b/libavcodec/mlpdsp.c
index b96200e..151cf83 100644
--- a/libavcodec/mlpdsp.c
+++ b/libavcodec/mlpdsp.c
@@ -60,6 +60,8 @@ static void mlp_filter_channel(int32_t *state, const int32_t *coeff,
av_cold void ff_mlpdsp_init(MLPDSPContext *c)
{
c->mlp_filter_channel = mlp_filter_channel;
+ if (ARCH_ARM)
+ ff_mlpdsp_init_arm(c);
if (ARCH_X86)
ff_mlpdsp_init_x86(c);
}
diff --git a/libavcodec/mlpdsp.h b/libavcodec/mlpdsp.h
index 995f72a..c985a17 100644
--- a/libavcodec/mlpdsp.h
+++ b/libavcodec/mlpdsp.h
@@ -32,6 +32,7 @@ typedef struct MLPDSPContext {
} MLPDSPContext;

void ff_mlpdsp_init(MLPDSPContext *c);
+void ff_mlpdsp_init_arm(MLPDSPContext *c);
void ff_mlpdsp_init_x86(MLPDSPContext *c);

#endif /* AVCODEC_MLPDSP_H */
--
1.7.5.4
Diego Biurrun
2014-03-19 17:49:05 UTC
Permalink
Post by Ben Avison
--- a/libavcodec/arm/Makefile
+++ b/libavcodec/arm/Makefile
@@ -21,9 +21,13 @@ OBJS-$(CONFIG_H264PRED) += arm/h264pred_init_arm.o
OBJS-$(CONFIG_H264QPEL) += arm/h264qpel_init_arm.o
OBJS-$(CONFIG_HPELDSP) += arm/hpeldsp_init_arm.o \
arm/hpeldsp_arm.o
+OBJS-$(CONFIG_MLP_DECODER) += arm/mlpdsp_init_arm.o \
+ arm/mlpdsp_arm.o
OBJS-$(CONFIG_MPEGAUDIODSP) += arm/mpegaudiodsp_init_arm.o
OBJS-$(CONFIG_MPEGVIDEO) += arm/mpegvideo_arm.o
OBJS-$(CONFIG_NEON_CLOBBER_TEST) += arm/neontest.o
+OBJS-$(CONFIG_TRUEHD_DECODER) += arm/mlpdsp_init_arm.o \
+ arm/mlpdsp_arm.o
The truehd decoder selects the mlp decoder, so this second entry is
redundant, drop it.

Diego
Ben Avison
2014-03-19 17:24:26 UTC
Permalink
Verified with profiling that this doesn't have a measurable effect upon
overall performance.
---
libavcodec/mlpdec.c | 40 +++++++++++++++++++++++-----------------
libavcodec/mlpdsp.c | 36 ++++++++++++++++++++++++++++++++++++
libavcodec/mlpdsp.h | 22 ++++++++++++++++++++++
3 files changed, 81 insertions(+), 17 deletions(-)

diff --git a/libavcodec/mlpdec.c b/libavcodec/mlpdec.c
index b9d1704..9dde60e 100644
--- a/libavcodec/mlpdec.c
+++ b/libavcodec/mlpdec.c
@@ -360,6 +360,10 @@ static int read_major_sync(MLPDecodeContext *m, GetBitContext *gb)
m->avctx->sample_fmt = AV_SAMPLE_FMT_S32;
else
m->avctx->sample_fmt = AV_SAMPLE_FMT_S16;
+ m->dsp.mlp_pack_output = m->dsp.mlp_select_pack_output(m->substream[m->max_decoded_substream].max_matrix_channel,
+ m->avctx->sample_fmt == AV_SAMPLE_FMT_S32,
+ m->substream[m->max_decoded_substream].ch_assign,
+ m->substream[m->max_decoded_substream].output_shift);

m->params_valid = 1;
for (substr = 0; substr < MAX_SUBSTREAMS; substr++)
@@ -588,6 +592,10 @@ FF_ENABLE_DEPRECATION_WARNINGS
if (substr == m->max_decoded_substream) {
m->avctx->channels = s->max_matrix_channel + 1;
m->avctx->channel_layout = s->ch_layout;
+ m->dsp.mlp_pack_output = m->dsp.mlp_select_pack_output(s->max_matrix_channel,
+ m->avctx->sample_fmt == AV_SAMPLE_FMT_S32,
+ s->ch_assign,
+ s->output_shift);
}

return 0;
@@ -818,9 +826,15 @@ static int read_decoding_params(MLPDecodeContext *m, GetBitContext *gbp,
return ret;

if (s->param_presence_flags & PARAM_OUTSHIFT)
- if (get_bits1(gbp))
+ if (get_bits1(gbp)) {
for (ch = 0; ch <= s->max_matrix_channel; ch++)
s->output_shift[ch] = get_sbits(gbp, 4);
+ if (substr == m->max_decoded_substream)
+ m->dsp.mlp_pack_output = m->dsp.mlp_select_pack_output(s->max_matrix_channel,
+ m->avctx->sample_fmt == AV_SAMPLE_FMT_S32,
+ s->ch_assign,
+ s->output_shift);
+ }

if (s->param_presence_flags & PARAM_QUANTSTEP)
if (get_bits1(gbp))
@@ -1019,9 +1033,6 @@ static int output_data(MLPDecodeContext *m, unsigned int substr,
{
AVCodecContext *avctx = m->avctx;
SubStream *s = &m->substream[substr];
- unsigned int i, out_ch = 0;
- int32_t *data_32;
- int16_t *data_16;
int ret;
int is32 = (m->avctx->sample_fmt == AV_SAMPLE_FMT_S32);

@@ -1041,19 +1052,14 @@ static int output_data(MLPDecodeContext *m, unsigned int substr,
av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
return ret;
}
- data_32 = (int32_t *)frame->data[0];
- data_16 = (int16_t *)frame->data[0];
-
- for (i = 0; i < s->blockpos; i++) {
- for (out_ch = 0; out_ch <= s->max_matrix_channel; out_ch++) {
- int mat_ch = s->ch_assign[out_ch];
- int32_t sample = m->sample_buffer[i][mat_ch]
- << s->output_shift[mat_ch];
- s->lossless_check_data ^= (sample & 0xffffff) << mat_ch;
- if (is32) *data_32++ = sample << 8;
- else *data_16++ = sample >> 8;
- }
- }
+ s->lossless_check_data = m->dsp.mlp_pack_output(s->lossless_check_data,
+ m->sample_buffer,
+ frame->data[0],
+ s->blockpos,
+ s->max_matrix_channel,
+ is32,
+ s->ch_assign,
+ s->output_shift);

/* Update matrix encoding side data */
if ((ret = ff_side_data_update_matrix_encoding(frame, s->matrix_encoding)) < 0)
diff --git a/libavcodec/mlpdsp.c b/libavcodec/mlpdsp.c
index dfa13af..b3b5ffd 100644
--- a/libavcodec/mlpdsp.c
+++ b/libavcodec/mlpdsp.c
@@ -89,10 +89,46 @@ void ff_mlp_rematrix_channel(int32_t *samples,
}
}

+static int32_t (*mlp_select_pack_output(uint8_t max_matrix_channel,
+ int is32,
+ uint8_t *ch_assign,
+ int8_t *output_shift))(int32_t, int32_t (*)[], void *, uint16_t, uint8_t, int, uint8_t*, int8_t *)
+{
+ return ff_mlp_pack_output;
+}
+
+int32_t ff_mlp_pack_output(int32_t lossless_check_data,
+ int32_t (*sample_buffer)[MAX_CHANNELS],
+ void *data,
+ uint16_t blockpos,
+ uint8_t max_matrix_channel,
+ int is32,
+ uint8_t *ch_assign,
+ int8_t *output_shift)
+{
+ unsigned int i, out_ch = 0;
+ int32_t *data_32 = (int32_t *)data;
+ int16_t *data_16 = (int16_t *)data;
+
+ for (i = 0; i < blockpos; i++) {
+ for (out_ch = 0; out_ch <= max_matrix_channel; out_ch++) {
+ int mat_ch = ch_assign[out_ch];
+ int32_t sample = sample_buffer[i][mat_ch]
+ << output_shift[mat_ch];
+ lossless_check_data ^= (sample & 0xffffff) << mat_ch;
+ if (is32) *data_32++ = sample << 8;
+ else *data_16++ = sample >> 8;
+ }
+ }
+ return lossless_check_data;
+}
+
av_cold void ff_mlpdsp_init(MLPDSPContext *c)
{
c->mlp_filter_channel = mlp_filter_channel;
c->mlp_rematrix_channel = ff_mlp_rematrix_channel;
+ c->mlp_select_pack_output = mlp_select_pack_output;
+ c->mlp_pack_output = ff_mlp_pack_output;
if (ARCH_ARM)
ff_mlpdsp_init_arm(c);
if (ARCH_X86)
diff --git a/libavcodec/mlpdsp.h b/libavcodec/mlpdsp.h
index bd864d9..7b7640e 100644
--- a/libavcodec/mlpdsp.h
+++ b/libavcodec/mlpdsp.h
@@ -23,6 +23,7 @@
#define AVCODEC_MLPDSP_H

#include <stdint.h>
+#include "mlp.h"

void ff_mlp_rematrix_channel(int32_t *samples,
const int32_t *coeffs,
@@ -36,6 +37,15 @@ void ff_mlp_rematrix_channel(int32_t *samples,
int access_unit_size_pow2,
int32_t mask);

+int32_t ff_mlp_pack_output(int32_t lossless_check_data,
+ int32_t (*sample_buffer)[MAX_CHANNELS],
+ void *data,
+ uint16_t blockpos,
+ uint8_t max_matrix_channel,
+ int is32,
+ uint8_t *ch_assign,
+ int8_t *output_shift);
+
typedef struct MLPDSPContext {
void (*mlp_filter_channel)(int32_t *state, const int32_t *coeff,
int firorder, int iirorder,
@@ -52,6 +62,18 @@ typedef struct MLPDSPContext {
int matrix_noise_shift,
int access_unit_size_pow2,
int32_t mask);
+ int32_t (*(*mlp_select_pack_output)(uint8_t max_matrix_channel,
+ int is32,
+ uint8_t *ch_assign,
+ int8_t *output_shift))(int32_t, int32_t (*)[], void *, uint16_t, uint8_t, int, uint8_t*, int8_t *);
+ int32_t (*mlp_pack_output)(int32_t lossless_check_data,
+ int32_t (*sample_buffer)[MAX_CHANNELS],
+ void *data,
+ uint16_t blockpos,
+ uint8_t max_matrix_channel,
+ int is32,
+ uint8_t *ch_assign,
+ int8_t *output_shift);
} MLPDSPContext;

void ff_mlpdsp_init(MLPDSPContext *c);
--
1.7.5.4
Diego Biurrun
2014-03-19 17:42:01 UTC
Permalink
Post by Ben Avison
--- a/libavcodec/mlpdsp.c
+++ b/libavcodec/mlpdsp.c
@@ -89,10 +89,46 @@ void ff_mlp_rematrix_channel(int32_t *samples,
+
+int32_t ff_mlp_pack_output(int32_t lossless_check_data,
This function is not used outside of the file, so it can be made
static and the ff_ prefix can be removed.
Post by Ben Avison
+ int32_t (*sample_buffer)[MAX_CHANNELS],
+ void *data,
+ uint16_t blockpos,
+ uint8_t max_matrix_channel,
+ int is32,
+ uint8_t *ch_assign,
+ int8_t *output_shift)
+{
+ unsigned int i, out_ch = 0;
+ int32_t *data_32 = (int32_t *)data;
+ int16_t *data_16 = (int16_t *)data;
pointless void* casts
Post by Ben Avison
+ lossless_check_data ^= (sample & 0xffffff) << mat_ch;
+ if (is32) *data_32++ = sample << 8;
+ else *data_16++ = sample >> 8;
Please break the lines.

Diego
James Almer
2014-03-19 18:18:01 UTC
Permalink
Post by Ben Avison
diff --git a/libavcodec/mlpdsp.h b/libavcodec/mlpdsp.h
index bd864d9..7b7640e 100644
--- a/libavcodec/mlpdsp.h
+++ b/libavcodec/mlpdsp.h
@@ -23,6 +23,7 @@
#define AVCODEC_MLPDSP_H
#include <stdint.h>
+#include "mlp.h"
void ff_mlp_rematrix_channel(int32_t *samples,
const int32_t *coeffs,
@@ -36,6 +37,15 @@ void ff_mlp_rematrix_channel(int32_t *samples,
int access_unit_size_pow2,
int32_t mask);
+int32_t ff_mlp_pack_output(int32_t lossless_check_data,
+ int32_t (*sample_buffer)[MAX_CHANNELS],
+ void *data,
+ uint16_t blockpos,
+ uint8_t max_matrix_channel,
+ int is32,
+ uint8_t *ch_assign,
+ int8_t *output_shift);
+
typedef struct MLPDSPContext {
void (*mlp_filter_channel)(int32_t *state, const int32_t *coeff,
int firorder, int iirorder,
@@ -52,6 +62,18 @@ typedef struct MLPDSPContext {
int matrix_noise_shift,
int access_unit_size_pow2,
int32_t mask);
+ int32_t (*(*mlp_select_pack_output)(uint8_t max_matrix_channel,
+ int is32,
+ uint8_t *ch_assign,
+ int8_t *output_shift))(int32_t, int32_t (*)[], void *, uint16_t, uint8_t, int, uint8_t*, int8_t *);
+ int32_t (*mlp_pack_output)(int32_t lossless_check_data,
+ int32_t (*sample_buffer)[MAX_CHANNELS],
+ void *data,
+ uint16_t blockpos,
+ uint8_t max_matrix_channel,
+ int is32,
+ uint8_t *ch_assign,
+ int8_t *output_shift);
} MLPDSPContext;
void ff_mlpdsp_init(MLPDSPContext *c);
Please put pointers first if possible, like you did for mlp_rematrix_channel.
Something like

+ int32_t (*mlp_pack_output)(int32_t (*sample_buffer)[MAX_CHANNELS],
+ void *data,
+ uint8_t *ch_assign,
+ int8_t *output_shift,
+ int32_t lossless_check_data,
+ uint16_t blockpos,
+ uint8_t max_matrix_channel,
+ int is32);
Ben Avison
2014-03-19 17:24:27 UTC
Permalink
Profiling results for overall decode and the output_data function in
particular are as follows:

Before After
Mean StdDev Mean StdDev Confidence Change
6:2 total 339.6 15.1 329.3 16.0 95.8% +3.1% (insignificant)
6:2 function 24.6 6.0 9.9 3.1 100.0% +148.5%
8:2 total 324.5 15.5 323.6 14.3 15.2% +0.3% (insignificant)
8:2 function 20.4 3.9 9.9 3.4 100.0% +104.7%
6:6 total 572.8 20.6 539.9 24.2 100.0% +6.1%
6:6 function 54.5 5.6 16.0 3.8 100.0% +240.9%
8:8 total 741.5 21.2 702.5 18.5 100.0% +5.6%
8:8 function 63.9 7.6 18.4 4.8 100.0% +247.3%

The assembly version has also been tested with a fuzz tester to ensure that
any combinations of inputs not exercised by my available test streams still
generate mathematically identical results to the C version.
---
libavcodec/arm/mlpdsp_arm.S | 503 ++++++++++++++++++++++++++++++++++++++
libavcodec/arm/mlpdsp_init_arm.c | 64 +++++
2 files changed, 567 insertions(+), 0 deletions(-)

diff --git a/libavcodec/arm/mlpdsp_arm.S b/libavcodec/arm/mlpdsp_arm.S
index 10008fe..338d323 100644
--- a/libavcodec/arm/mlpdsp_arm.S
+++ b/libavcodec/arm/mlpdsp_arm.S
@@ -98,6 +98,26 @@ A .endif
.endif
.endm

+.macro loadregoffsh2 group, index, base, offgroup, offindex
+ .altmacro
+ loadregoffsh2_ \group, %(\index), \base, \offgroup, %(\offindex)
+ .noaltmacro
+.endm
+
+.macro loadregoffsh2_ group, index, base, offgroup, offindex
+ ldr \group\index, [\base, \offgroup\offindex, lsl #2]
+.endm
+
+.macro eorlslreg check, data, group, index
+ .altmacro
+ eorlslreg_ \check, \data, \group, %(\index)
+ .noaltmacro
+.endm
+
+.macro eorlslreg_ check, data, group, index
+ eor \check, \check, \data, lsl \group\index
+.endm
+
// A macro to update the load register number and load offsets

.macro inc howmany
@@ -659,3 +679,486 @@ endfunc
.unreq LSB
.unreq DCH
.unreq MASK
+
+/********************************************************************/
+
+.macro decr_modulo var, by, modulus
+ .set \var, \var - \by
+ .if \var == 0
+ .set \var, \modulus
+ .endif
+.endm
+
+ .macro load_group1 size, channels, r0, r1, r2, r3, pointer_dead=0
+ .if \size == 2
+ ldrd \r0, \r1, [IN], #(\size + 8 - \channels) * 4
+ .else // size == 4
+ .if IDX1 > 4 || \channels==8
+ ldm IN!, {\r0, \r1, \r2, \r3}
+ .else
+ ldm IN, {\r0, \r1, \r2, \r3}
+ .if !\pointer_dead
+ add IN, IN, #(4 + 8 - \channels) * 4
+ .endif
+ .endif
+ .endif
+ decr_modulo IDX1, \size, \channels
+ .endm
+
+ .macro load_group2 size, channels, r0, r1, r2, r3, pointer_dead=0
+ .if \size == 2
+ .if IDX1 > 2
+ ldm IN!, {\r2, \r3}
+ .else
+//A .ifc \r2, ip
+//A .if \pointer_dead
+//A ldm IN, {\r2, \r3}
+//A .else
+//A ldr \r2, [IN], #4
+//A ldr \r3, [IN], #(\size - 1 + 8 - \channels) * 4
+//A .endif
+//A .else
+ ldrd \r2, \r3, [IN], #(\size + 8 - \channels) * 4
+//A .endif
+ .endif
+ .endif
+ decr_modulo IDX1, \size, \channels
+ .endm
+
+.macro implement_pack inorder, channels, shift
+.if \inorder
+.ifc \shift, mixed
+
+CHECK .req a1
+IN .req a2
+OUT .req a3
+COUNT .req a4
+DAT0 .req v1
+DAT1 .req v2
+DAT2 .req v3
+DAT3 .req v4
+SHIFT0 .req v5
+SHIFT1 .req v6
+SHIFT2 .req sl
+SHIFT3 .req fp
+SHIFT4 .req ip
+SHIFT5 .req lr
+
+ .macro output4words
+ .set SIZE_GROUP1, IDX1
+ .if SIZE_GROUP1 > 4
+ .set SIZE_GROUP1, 4
+ .endif
+ .set SIZE_GROUP2, 4 - SIZE_GROUP1
+ load_group1 SIZE_GROUP1, \channels, DAT0, DAT1, DAT2, DAT3
+ load_group2 SIZE_GROUP2, \channels, DAT0, DAT1, DAT2, DAT3
+ .if \channels == 2
+ lsl DAT0, SHIFT0
+ lsl DAT1, SHIFT1
+ lsl DAT2, SHIFT0
+ lsl DAT3, SHIFT1
+ .elseif \channels == 6
+ .if IDX2 == 6
+ lsl DAT0, SHIFT0
+ lsl DAT1, SHIFT1
+ lsl DAT2, SHIFT2
+ lsl DAT3, SHIFT3
+ .elseif IDX2 == 2
+ lsl DAT0, SHIFT4
+ lsl DAT1, SHIFT5
+ lsl DAT2, SHIFT0
+ lsl DAT3, SHIFT1
+ .else // IDX2 == 4
+ lsl DAT0, SHIFT2
+ lsl DAT1, SHIFT3
+ lsl DAT2, SHIFT4
+ lsl DAT3, SHIFT5
+ .endif
+ .elseif \channels == 8
+ .if IDX2 == 8
+ uxtb SHIFT0, SHIFT4, ror #0
+ uxtb SHIFT1, SHIFT4, ror #8
+ uxtb SHIFT2, SHIFT4, ror #16
+ uxtb SHIFT3, SHIFT4, ror #24
+ .else
+ uxtb SHIFT0, SHIFT5, ror #0
+ uxtb SHIFT1, SHIFT5, ror #8
+ uxtb SHIFT2, SHIFT5, ror #16
+ uxtb SHIFT3, SHIFT5, ror #24
+ .endif
+ lsl DAT0, SHIFT0
+ lsl DAT1, SHIFT1
+ lsl DAT2, SHIFT2
+ lsl DAT3, SHIFT3
+ .endif
+ eor CHECK, CHECK, DAT0, lsr #8 - (\channels - IDX2)
+ eor CHECK, CHECK, DAT1, lsr #7 - (\channels - IDX2)
+ decr_modulo IDX2, 2, \channels
+ eor CHECK, CHECK, DAT2, lsr #8 - (\channels - IDX2)
+ eor CHECK, CHECK, DAT3, lsr #7 - (\channels - IDX2)
+ decr_modulo IDX2, 2, \channels
+ stm OUT!, {DAT0 - DAT3}
+ .endm
+
+ .set WORDS_PER_LOOP, \channels // calculate LCM (channels, 4)
+ .if (WORDS_PER_LOOP % 2) == 0
+ .set WORDS_PER_LOOP, WORDS_PER_LOOP / 2
+ .endif
+ .if (WORDS_PER_LOOP % 2) == 0
+ .set WORDS_PER_LOOP, WORDS_PER_LOOP / 2
+ .endif
+ .set WORDS_PER_LOOP, WORDS_PER_LOOP * 4
+ .set SAMPLES_PER_LOOP, WORDS_PER_LOOP / \channels
+
+function ff_mlp_pack_output_inorder_\channels\()ch_mixedshift_arm, export=1
+ .if SAMPLES_PER_LOOP > 1
+ tst COUNT, #SAMPLES_PER_LOOP - 1 // always seems to be in practice
+ bne X(ff_mlp_pack_output) // but just in case, branch to C implementation if not
+ .endif
+ teq COUNT, #0
+ bxeq lr
+ push {v1-v6,sl,fp,lr}
+ ldr SHIFT0, [sp, #(9+3)*4] // get output_shift from stack
+ ldr SHIFT1, =0x08080808
+ ldr SHIFT4, [SHIFT0]
+ .if \channels == 2
+ uadd8 SHIFT4, SHIFT4, SHIFT1 // increase all shifts by 8
+ uxtb SHIFT0, SHIFT4, ror #0
+ uxtb SHIFT1, SHIFT4, ror #8
+ .else
+ ldr SHIFT5, [SHIFT0, #4]
+ uadd8 SHIFT4, SHIFT4, SHIFT1 // increase all shifts by 8
+ uadd8 SHIFT5, SHIFT5, SHIFT1
+ .if \channels == 6
+ uxtb SHIFT0, SHIFT4, ror #0
+ uxtb SHIFT1, SHIFT4, ror #8
+ uxtb SHIFT2, SHIFT4, ror #16
+ uxtb SHIFT3, SHIFT4, ror #24
+ uxtb SHIFT4, SHIFT5, ror #0
+ uxtb SHIFT5, SHIFT5, ror #8
+ .endif
+ .endif
+ .set IDX1, \channels
+ .set IDX2, \channels
+0:
+ .rept WORDS_PER_LOOP / 4
+ output4words
+ .endr
+ subs COUNT, COUNT, #SAMPLES_PER_LOOP
+ bne 0b
+ pop {v1-v6,sl,fp,pc}
+ .ltorg
+endfunc
+ .purgem output4words
+
+ .unreq CHECK
+ .unreq IN
+ .unreq OUT
+ .unreq COUNT
+ .unreq DAT0
+ .unreq DAT1
+ .unreq DAT2
+ .unreq DAT3
+ .unreq SHIFT0
+ .unreq SHIFT1
+ .unreq SHIFT2
+ .unreq SHIFT3
+ .unreq SHIFT4
+ .unreq SHIFT5
+
+.else // not mixed
+
+CHECK .req a1
+IN .req a2
+OUT .req a3
+COUNT .req a4
+DAT0 .req v1
+DAT1 .req v2
+DAT2 .req v3
+DAT3 .req v4
+DAT4 .req v5
+DAT5 .req v6
+DAT6 .req sl // use these rather than the otherwise unused
+DAT7 .req fp // ip and lr so that we can load them usinf LDRD
+
+ .macro output4words tail, head, r0, r1, r2, r3, r4, r5, r6, r7, pointer_dead=0
+ .if \head
+ .set SIZE_GROUP1, IDX1
+ .if SIZE_GROUP1 > 4
+ .set SIZE_GROUP1, 4
+ .endif
+ .set SIZE_GROUP2, 4 - SIZE_GROUP1
+ load_group1 SIZE_GROUP1, \channels, \r0, \r1, \r2, \r3, \pointer_dead
+ .endif
+ .if \tail
+ eor CHECK, CHECK, \r4, lsr #8 - (\channels - IDX2)
+ eor CHECK, CHECK, \r5, lsr #7 - (\channels - IDX2)
+ decr_modulo IDX2, 2, \channels
+ .endif
+ .if \head
+ load_group2 SIZE_GROUP2, \channels, \r0, \r1, \r2, \r3, \pointer_dead
+ .endif
+ .if \tail
+ eor CHECK, CHECK, \r6, lsr #8 - (\channels - IDX2)
+ eor CHECK, CHECK, \r7, lsr #7 - (\channels - IDX2)
+ decr_modulo IDX2, 2, \channels
+ stm OUT!, {\r4, \r5, \r6, \r7}
+ .endif
+ .if \head
+ lsl \r0, #8 + \shift
+ lsl \r1, #8 + \shift
+ lsl \r2, #8 + \shift
+ lsl \r3, #8 + \shift
+ .endif
+ .endm
+
+ .set WORDS_PER_LOOP, \channels // calculate LCM (channels, 8)
+ .if (WORDS_PER_LOOP % 2) == 0
+ .set WORDS_PER_LOOP, WORDS_PER_LOOP / 2
+ .endif
+ .if (WORDS_PER_LOOP % 2) == 0
+ .set WORDS_PER_LOOP, WORDS_PER_LOOP / 2
+ .endif
+ .if (WORDS_PER_LOOP % 2) == 0
+ .set WORDS_PER_LOOP, WORDS_PER_LOOP / 2
+ .endif
+ .set WORDS_PER_LOOP, WORDS_PER_LOOP * 8
+ .set SAMPLES_PER_LOOP, WORDS_PER_LOOP / \channels
+
+function ff_mlp_pack_output_inorder_\channels\()ch_\shift\()shift_arm, export=1
+ .if SAMPLES_PER_LOOP > 1
+ tst COUNT, #SAMPLES_PER_LOOP - 1 // always seems to be in practice
+ bne X(ff_mlp_pack_output) // but just in case, branch to C implementation if not
+ .endif
+ subs COUNT, COUNT, #SAMPLES_PER_LOOP
+ bxlo lr
+ push {v1-v6,sl,fp,lr}
+ .set IDX1, \channels
+ .set IDX2, \channels
+ output4words 0, 1, DAT0, DAT1, DAT2, DAT3, DAT4, DAT5, DAT6, DAT7
+0: beq 1f
+ .rept WORDS_PER_LOOP / 8
+ output4words 1, 1, DAT4, DAT5, DAT6, DAT7, DAT0, DAT1, DAT2, DAT3
+ output4words 1, 1, DAT0, DAT1, DAT2, DAT3, DAT4, DAT5, DAT6, DAT7
+ .endr
+ subs COUNT, COUNT, #SAMPLES_PER_LOOP
+ bne 0b
+1:
+ .rept WORDS_PER_LOOP / 8 - 1
+ output4words 1, 1, DAT4, DAT5, DAT6, DAT7, DAT0, DAT1, DAT2, DAT3
+ output4words 1, 1, DAT0, DAT1, DAT2, DAT3, DAT4, DAT5, DAT6, DAT7
+ .endr
+ output4words 1, 1, DAT4, DAT5, DAT6, DAT7, DAT0, DAT1, DAT2, DAT3, pointer_dead=1
+ output4words 1, 0, DAT0, DAT1, DAT2, DAT3, DAT4, DAT5, DAT6, DAT7
+ pop {v1-v6,sl,fp,pc}
+endfunc
+ .purgem output4words
+
+ .unreq CHECK
+ .unreq IN
+ .unreq OUT
+ .unreq COUNT
+ .unreq DAT0
+ .unreq DAT1
+ .unreq DAT2
+ .unreq DAT3
+ .unreq DAT4
+ .unreq DAT5
+ .unreq DAT6
+ .unreq DAT7
+
+.endif // mixed
+.else // not inorder
+.ifc \shift, mixed
+
+// This case not currently handled
+
+.else // not mixed
+
+CHECK .req a1
+IN .req a2
+OUT .req a3
+COUNT .req a4
+DAT0 .req v1
+DAT1 .req v2
+DAT2 .req v3
+DAT3 .req v4
+CHAN0 .req v5
+CHAN1 .req v6
+CHAN2 .req sl
+CHAN3 .req fp
+CHAN4 .req ip
+CHAN5 .req lr
+
+ .macro output4words
+ .if \channels == 8
+ .if IDX1 == 8
+ uxtb CHAN0, CHAN4, ror #0
+ uxtb CHAN1, CHAN4, ror #8
+ uxtb CHAN2, CHAN4, ror #16
+ uxtb CHAN3, CHAN4, ror #24
+ .else
+ uxtb CHAN0, CHAN5, ror #0
+ uxtb CHAN1, CHAN5, ror #8
+ uxtb CHAN2, CHAN5, ror #16
+ uxtb CHAN3, CHAN5, ror #24
+ .endif
+ ldr DAT0, [IN, CHAN0, lsl #2]
+ ldr DAT1, [IN, CHAN1, lsl #2]
+ ldr DAT2, [IN, CHAN2, lsl #2]
+ ldr DAT3, [IN, CHAN3, lsl #2]
+ .if IDX1 == 4
+ add IN, IN, #8*4
+ .endif
+ decr_modulo IDX1, 4, \channels
+ .else
+ .set SIZE_GROUP1, IDX1
+ .if SIZE_GROUP1 > 4
+ .set SIZE_GROUP1, 4
+ .endif
+ .set SIZE_GROUP2, 4 - SIZE_GROUP1
+ .if SIZE_GROUP1 == 2
+ loadregoffsh2 DAT, 0, IN, CHAN, 0 + (\channels - IDX1)
+ loadregoffsh2 DAT, 1, IN, CHAN, 1 + (\channels - IDX1)
+ add IN, IN, #8*4
+ .else // SIZE_GROUP1 == 4
+ loadregoffsh2 DAT, 0, IN, CHAN, 0 + (\channels - IDX1)
+ loadregoffsh2 DAT, 1, IN, CHAN, 1 + (\channels - IDX1)
+ loadregoffsh2 DAT, 2, IN, CHAN, 2 + (\channels - IDX1)
+ loadregoffsh2 DAT, 3, IN, CHAN, 3 + (\channels - IDX1)
+ .if IDX1 == 4
+ add IN, IN, #8*4
+ .endif
+ .endif
+ decr_modulo IDX1, SIZE_GROUP1, \channels
+ .if SIZE_GROUP2 == 2
+ loadregoffsh2 DAT, 2, IN, CHAN, 0 + (\channels - IDX1)
+ loadregoffsh2 DAT, 3, IN, CHAN, 1 + (\channels - IDX1)
+ .if IDX1 == 2
+ add IN, IN, #8*4
+ .endif
+ .endif
+ decr_modulo IDX1, SIZE_GROUP2, \channels
+ .endif
+ .if \channels == 8 // in this case we can corrupt CHAN0-3
+ rsb CHAN0, CHAN0, #8
+ rsb CHAN1, CHAN1, #8
+ rsb CHAN2, CHAN2, #8
+ rsb CHAN3, CHAN3, #8
+ lsl DAT0, #8 + \shift
+ lsl DAT1, #8 + \shift
+ lsl DAT2, #8 + \shift
+ lsl DAT3, #8 + \shift
+ eor CHECK, CHECK, DAT0, lsr CHAN0
+ eor CHECK, CHECK, DAT1, lsr CHAN1
+ eor CHECK, CHECK, DAT2, lsr CHAN2
+ eor CHECK, CHECK, DAT3, lsr CHAN3
+ .else
+ .if \shift != 0
+ lsl DAT0, #\shift
+ lsl DAT1, #\shift
+ lsl DAT2, #\shift
+ lsl DAT3, #\shift
+ .endif
+ bic DAT0, DAT0, #0xff000000
+ bic DAT1, DAT1, #0xff000000
+ bic DAT2, DAT2, #0xff000000
+ bic DAT3, DAT3, #0xff000000
+ eorlslreg CHECK, DAT0, CHAN, 0 + (\channels - IDX2)
+ eorlslreg CHECK, DAT1, CHAN, 1 + (\channels - IDX2)
+ decr_modulo IDX2, 2, \channels
+ eorlslreg CHECK, DAT2, CHAN, 0 + (\channels - IDX2)
+ eorlslreg CHECK, DAT3, CHAN, 1 + (\channels - IDX2)
+ decr_modulo IDX2, 2, \channels
+ lsl DAT0, #8
+ lsl DAT1, #8
+ lsl DAT2, #8
+ lsl DAT3, #8
+ .endif
+ stm OUT!, {DAT0 - DAT3}
+ .endm
+
+ .set WORDS_PER_LOOP, \channels // calculate LCM (channels, 4)
+ .if (WORDS_PER_LOOP % 2) == 0
+ .set WORDS_PER_LOOP, WORDS_PER_LOOP / 2
+ .endif
+ .if (WORDS_PER_LOOP % 2) == 0
+ .set WORDS_PER_LOOP, WORDS_PER_LOOP / 2
+ .endif
+ .set WORDS_PER_LOOP, WORDS_PER_LOOP * 4
+ .set SAMPLES_PER_LOOP, WORDS_PER_LOOP / \channels
+
+function ff_mlp_pack_output_outoforder_\channels\()ch_\shift\()shift_arm, export=1
+ .if SAMPLES_PER_LOOP > 1
+ tst COUNT, #SAMPLES_PER_LOOP - 1 // always seems to be in practice
+ bne X(ff_mlp_pack_output) // but just in case, branch to C implementation if not
+ .endif
+ teq COUNT, #0
+ bxeq lr
+ push {v1-v6,sl,fp,lr}
+ ldr CHAN0, [sp, #(9+2)*4] // get ch_assign from stack
+ ldr CHAN4, [CHAN0]
+ .if \channels == 2
+ uxtb CHAN0, CHAN4, ror #0
+ uxtb CHAN1, CHAN4, ror #8
+ .else
+ ldr CHAN5, [CHAN0, #4]
+ .if \channels == 6
+ uxtb CHAN0, CHAN4, ror #0
+ uxtb CHAN1, CHAN4, ror #8
+ uxtb CHAN2, CHAN4, ror #16
+ uxtb CHAN3, CHAN4, ror #24
+ uxtb CHAN4, CHAN5, ror #0
+ uxtb CHAN5, CHAN5, ror #8
+ .endif
+ .endif
+ .set IDX1, \channels
+ .set IDX2, \channels
+0:
+ .rept WORDS_PER_LOOP / 4
+ output4words
+ .endr
+ subs COUNT, COUNT, #SAMPLES_PER_LOOP
+ bne 0b
+ pop {v1-v6,sl,fp,pc}
+ .ltorg
+endfunc
+ .purgem output4words
+
+ .unreq CHECK
+ .unreq IN
+ .unreq OUT
+ .unreq COUNT
+ .unreq DAT0
+ .unreq DAT1
+ .unreq DAT2
+ .unreq DAT3
+ .unreq CHAN0
+ .unreq CHAN1
+ .unreq CHAN2
+ .unreq CHAN3
+ .unreq CHAN4
+ .unreq CHAN5
+
+.endif // mixed
+.endif // inorder
+.endm // implement_pack
+
+.macro pack_channels inorder, channels
+ implement_pack \inorder, \channels, 0
+ implement_pack \inorder, \channels, 1
+ implement_pack \inorder, \channels, 2
+ implement_pack \inorder, \channels, 3
+ implement_pack \inorder, \channels, 4
+ implement_pack \inorder, \channels, 5
+ implement_pack \inorder, \channels, mixed
+.endm
+
+.macro pack_order inorder
+ pack_channels \inorder, 2
+ pack_channels \inorder, 6
+ pack_channels \inorder, 8
+.endm
+
+ pack_order 0
+ pack_order 1
diff --git a/libavcodec/arm/mlpdsp_init_arm.c b/libavcodec/arm/mlpdsp_init_arm.c
index 268dfdd..2d8b98d 100644
--- a/libavcodec/arm/mlpdsp_init_arm.c
+++ b/libavcodec/arm/mlpdsp_init_arm.c
@@ -41,8 +41,72 @@ void ff_mlp_rematrix_channel_arm(int32_t *samples,
int access_unit_size_pow2,
int32_t mask);

+#define DECLARE_PACK(order,channels,shift) \
+ int32_t ff_mlp_pack_output_##order##order_##channels##ch_##shift##shift_arm(int32_t, int32_t (*)[], void *, uint16_t, uint8_t, int, uint8_t*, int8_t *);
+#define ENUMERATE_PACK(order,channels,shift) \
+ ff_mlp_pack_output_##order##order_##channels##ch_##shift##shift_arm,
+#define PACK_CHANNELS(macro,order,channels) \
+ macro(order,channels,0) \
+ macro(order,channels,1) \
+ macro(order,channels,2) \
+ macro(order,channels,3) \
+ macro(order,channels,4) \
+ macro(order,channels,5) \
+ macro(order,channels,mixed)
+#define PACK_ORDER(macro,order) \
+ PACK_CHANNELS(macro,order,2) \
+ PACK_CHANNELS(macro,order,6) \
+ PACK_CHANNELS(macro,order,8)
+#define PACK_ALL(macro) \
+ PACK_ORDER(macro,outof) \
+ PACK_ORDER(macro,in)
+PACK_ALL(DECLARE_PACK)
+
+#define ff_mlp_pack_output_outoforder_2ch_mixedshift_arm 0
+#define ff_mlp_pack_output_outoforder_6ch_mixedshift_arm 0
+#define ff_mlp_pack_output_outoforder_8ch_mixedshift_arm 0
+
+static int32_t (*mlp_select_pack_output_arm(uint8_t max_matrix_channel,
+ int is32,
+ uint8_t *ch_assign,
+ int8_t *output_shift))(int32_t, int32_t (*)[], void *, uint16_t, uint8_t, int, uint8_t*, int8_t *)
+{
+ int ch_index;
+ int shift = output_shift[0] < 0 || output_shift[0] > 5 ? 6 : output_shift[0];
+ int inorder = 1;
+ static int32_t (*const routine[2*3*7])(int32_t, int32_t (*)[], void *, uint16_t, uint8_t, int, uint8_t*, int8_t *) = {
+ PACK_ALL(ENUMERATE_PACK)
+ };
+ int i;
+
+ if (!is32) // don't support 16-bit output (it's not used by TrueHD)
+ return ff_mlp_pack_output;
+
+ switch (max_matrix_channel) {
+ case 1: ch_index = 0; break;
+ case 5: ch_index = 1; break;
+ case 7: ch_index = 2; break;
+ default: return ff_mlp_pack_output;
+ }
+
+ for (i = 0; i <= max_matrix_channel; i++) {
+ if (shift != 6 && output_shift[i] != shift)
+ shift = 6; // indicate mixed shifts
+ if (ch_assign[i] != i)
+ inorder = 0;
+ }
+ if (shift == 6 && !inorder)
+ return ff_mlp_pack_output; // can't currently handle both an order array and a shift array
+
+ return routine[(inorder*3+ch_index)*7+shift];
+}
+
av_cold void ff_mlpdsp_init_arm(MLPDSPContext *c)
{
+ int cpu_flags = av_get_cpu_flags();
+
c->mlp_filter_channel = ff_mlp_filter_channel_arm;
c->mlp_rematrix_channel = ff_mlp_rematrix_channel_arm;
+ if (cpu_flags & AV_CPU_FLAG_ARMV6)
+ c->mlp_select_pack_output = mlp_select_pack_output_arm;
}
--
1.7.5.4
Diego Biurrun
2014-03-19 17:46:09 UTC
Permalink
Post by Ben Avison
--- a/libavcodec/arm/mlpdsp_init_arm.c
+++ b/libavcodec/arm/mlpdsp_init_arm.c
@@ -41,8 +41,72 @@ void ff_mlp_rematrix_channel_arm(int32_t *samples,
+
+ switch (max_matrix_channel) {
+ case 1: ch_index = 0; break;
+ case 5: ch_index = 1; break;
+ case 7: ch_index = 2; break;
+ default: return ff_mlp_pack_output;
Please break the lines.
Post by Ben Avison
av_cold void ff_mlpdsp_init_arm(MLPDSPContext *c)
{
+ int cpu_flags = av_get_cpu_flags();
+
c->mlp_filter_channel = ff_mlp_filter_channel_arm;
c->mlp_rematrix_channel = ff_mlp_rematrix_channel_arm;
+ if (cpu_flags & AV_CPU_FLAG_ARMV6)
+ c->mlp_select_pack_output = mlp_select_pack_output_arm;
If the function is specific to ARMv6, it should have an _armv6 suffix,
reside in a separate file and be compiled conditionally.

Diego
Martin Storsjö
2014-03-19 19:18:25 UTC
Permalink
truehd: add hand-scheduled ARM asm version of mlp_filter_channel.
truehd: break out part of rematrix_channels into platform-specific
callback.
truehd: add hand-scheduled ARM asm version of
ff_mlp_rematrix_channel.
truehd: tune VLC decoding for ARM.
truehd: break out part of output_data into platform-specific
callback.
truehd: add hand-scheduled ARM asm version of ff_mlp_pack_output.
libavcodec/arm/Makefile | 4 +
libavcodec/arm/mlpdsp_arm.S | 1164 ++++++++++++++++++++++++++++++++++++++
libavcodec/arm/mlpdsp_init_arm.c | 112 ++++
libavcodec/mlpdec.c | 90 ++--
libavcodec/mlpdsp.c | 71 +++
libavcodec/mlpdsp.h | 46 ++
6 files changed, 1442 insertions(+), 45 deletions(-)
create mode 100644 libavcodec/arm/mlpdsp_arm.S
create mode 100644 libavcodec/arm/mlpdsp_init_arm.c
FWIW, this doesn't build properly with gas-preprocessor at the moment - I
or someone else needs to get that sorted out before this can be merged.

// Martin
Ben Avison
2014-03-19 19:41:03 UTC
Permalink
[Belatedly changing out of digest mode - hope this doesn't screw up
people's threading too much...]
Post by Diego Biurrun
Post by Ben Avison
--- a/libavcodec/mlpdec.c
+++ b/libavcodec/mlpdec.c
--- a/libavcodec/mlpdsp.c
+++ b/libavcodec/mlpdsp.c
@@ -57,9 +57,42 @@ static void mlp_filter_channel(int32_t *state, const int32_t *coeff,
+void ff_mlp_rematrix_channel(int32_t *samples,
This is not used outside of the file, so it should be a static function
and not have a ff_ prefix.
It is called from outside the file: from arm/mlpdsp_arm.S line 663. It's
used as a fallback for rare cases that aren't handled by the assembly.

Ben
Ben Avison
2014-03-19 19:43:49 UTC
Permalink
Post by Diego Biurrun
Post by Ben Avison
--- a/libavcodec/mlpdsp.c
+++ b/libavcodec/mlpdsp.c
@@ -89,10 +89,46 @@ void ff_mlp_rematrix_channel(int32_t *samples,
+
+int32_t ff_mlp_pack_output(int32_t lossless_check_data,
This function is not used outside of the file, so it can be made
static and the ff_ prefix can be removed.
It's used from several places in arm/mlpdsp_init_arm.c.
Post by Diego Biurrun
Post by Ben Avison
+ int32_t *data_32 = (int32_t *)data;
+ int16_t *data_16 = (int16_t *)data;
pointless void* casts
Fair enough, those were cut-and-pastes from their original location.
Post by Diego Biurrun
Post by Ben Avison
+ lossless_check_data ^= (sample & 0xffffff) << mat_ch;
+ if (is32) *data_32++ = sample << 8;
+ else *data_16++ = sample >> 8;
Please break the lines.
OK, can do.

Ben
Diego Biurrun
2014-03-20 11:38:28 UTC
Permalink
Post by Ben Avison
Post by Diego Biurrun
Post by Ben Avison
--- a/libavcodec/mlpdsp.c
+++ b/libavcodec/mlpdsp.c
@@ -89,10 +89,46 @@ void ff_mlp_rematrix_channel(int32_t *samples,
+ int32_t *data_32 = (int32_t *)data;
+ int16_t *data_16 = (int16_t *)data;
pointless void* casts
Fair enough, those were cut-and-pastes from their original location.
Which location? I'll change them ..

Diego
Ben Avison
2014-03-20 19:04:26 UTC
Permalink
Post by Diego Biurrun
Post by Ben Avison
Post by Diego Biurrun
Post by Ben Avison
--- a/libavcodec/mlpdsp.c
+++ b/libavcodec/mlpdsp.c
@@ -89,10 +89,46 @@ void ff_mlp_rematrix_channel(int32_t *samples,
+ int32_t *data_32 = (int32_t *)data;
+ int16_t *data_16 = (int16_t *)data;
pointless void* casts
Fair enough, those were cut-and-pastes from their original location.
Which location? I'll change them ..
Those (and the lines a bit further down the same function where you
objected to the formatting) were moved from rematrix_channels() in
mlpdec.c. My patch series already deletes the offenders from their
original position. :)

Ben
Ben Avison
2014-03-20 00:48:05 UTC
Permalink
An updated series taking into account comments to date.

Ben Avison (6):
truehd: add hand-scheduled ARM asm version of mlp_filter_channel.
truehd: break out part of rematrix_channels into platform-specific
callback.
truehd: add hand-scheduled ARM asm version of
ff_mlp_rematrix_channel.
truehd: tune VLC decoding for ARM.
truehd: break out part of output_data into platform-specific
callback.
truehd: add hand-scheduled ARM asm version of ff_mlp_pack_output.

libavcodec/arm/Makefile | 3 +
libavcodec/arm/mlpdsp_arm.S | 665 ++++++++++++++++++++++++++++++++++++++
libavcodec/arm/mlpdsp_armv6.S | 526 ++++++++++++++++++++++++++++++
libavcodec/arm/mlpdsp_init_arm.c | 119 +++++++
libavcodec/mlpdec.c | 90 +++---
libavcodec/mlpdsp.c | 73 +++++
libavcodec/mlpdsp.h | 46 +++
7 files changed, 1477 insertions(+), 45 deletions(-)
create mode 100644 libavcodec/arm/mlpdsp_arm.S
create mode 100644 libavcodec/arm/mlpdsp_armv6.S
create mode 100644 libavcodec/arm/mlpdsp_init_arm.c
--
1.7.5.4
Ben Avison
2014-03-20 00:48:06 UTC
Permalink
Profiling results for overall audio decode and the mlp_filter_channel(_arm)
function in particular are as follows:

Before After
Mean StdDev Mean StdDev Confidence Change
6:2 total 380.4 22.0 370.8 17.0 87.4% +2.6% (insignificant)
6:2 function 60.7 7.2 36.6 8.1 100.0% +65.8%
8:2 total 357.0 17.5 343.2 19.0 97.8% +4.0% (insignificant)
8:2 function 60.3 8.8 37.3 3.8 100.0% +61.8%
6:6 total 717.2 23.2 658.4 15.7 100.0% +8.9%
6:6 function 140.4 12.9 81.5 9.2 100.0% +72.4%
8:8 total 981.9 16.2 896.2 24.5 100.0% +9.6%
8:8 function 193.4 15.0 103.3 11.5 100.0% +87.2%

Experiments with adding preload instructions to this function yielded no
useful benefit, so these have not been included.

The assembly version has also been tested with a fuzz tester to ensure that
any combinations of inputs not exercised by my available test streams still
generate mathematically identical results to the C version.
---
libavcodec/arm/Makefile | 2 +
libavcodec/arm/mlpdsp_arm.S | 435 ++++++++++++++++++++++++++++++++++++++
libavcodec/arm/mlpdsp_init_arm.c | 36 +++
libavcodec/mlpdsp.c | 2 +
libavcodec/mlpdsp.h | 1 +
5 files changed, 476 insertions(+), 0 deletions(-)
create mode 100644 libavcodec/arm/mlpdsp_arm.S
create mode 100644 libavcodec/arm/mlpdsp_init_arm.c

diff --git a/libavcodec/arm/Makefile b/libavcodec/arm/Makefile
index 8bdccbd..c6cc96e 100644
--- a/libavcodec/arm/Makefile
+++ b/libavcodec/arm/Makefile
@@ -21,6 +21,8 @@ OBJS-$(CONFIG_H264PRED) += arm/h264pred_init_arm.o
OBJS-$(CONFIG_H264QPEL) += arm/h264qpel_init_arm.o
OBJS-$(CONFIG_HPELDSP) += arm/hpeldsp_init_arm.o \
arm/hpeldsp_arm.o
+OBJS-$(CONFIG_MLP_DECODER) += arm/mlpdsp_init_arm.o \
+ arm/mlpdsp_arm.o
OBJS-$(CONFIG_MPEGAUDIODSP) += arm/mpegaudiodsp_init_arm.o
OBJS-$(CONFIG_MPEGVIDEO) += arm/mpegvideo_arm.o
OBJS-$(CONFIG_NEON_CLOBBER_TEST) += arm/neontest.o
diff --git a/libavcodec/arm/mlpdsp_arm.S b/libavcodec/arm/mlpdsp_arm.S
new file mode 100644
index 0000000..9e0bf57
--- /dev/null
+++ b/libavcodec/arm/mlpdsp_arm.S
@@ -0,0 +1,435 @@
+/*
+ * Copyright (c) 2014 RISC OS Open Ltd
+ * Author: Ben Avison <***@riscosopen.org>
+ *
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/arm/asm.S"
+
+// This code uses too many ARM-only tricks to easily assemble as Thumb
+.arm
+
+#define MAX_CHANNELS 8
+#define MAX_FIR_ORDER 8
+#define MAX_IIR_ORDER 4
+#define MAX_RATEFACTOR 4
+#define MAX_BLOCKSIZE (40 * MAX_RATEFACTOR)
+
+PST .req a1
+PCO .req a2
+AC0 .req a3
+AC1 .req a4
+CO0 .req v1
+CO1 .req v2
+CO2 .req v3
+CO3 .req v4
+ST0 .req v5
+ST1 .req v6
+ST2 .req sl
+ST3 .req fp
+I .req ip
+PSAMP .req lr
+
+
+// Some macros that do loads/multiplies where the register number is determined
+// from an assembly-time expression. Boy is GNU assembler's syntax ugly...
+
+.macro load group, index, base, offset
+ .altmacro
+ load_ \group, %(\index), \base, \offset
+ .noaltmacro
+.endm
+
+.macro load_ group, index, base, offset
+ ldr \group\index, [\base, #\offset]
+.endm
+
+.macro loadd group, index, base, offset
+ .altmacro
+ loadd_ \group, %(\index), %(\index+1), \base, \offset
+ .noaltmacro
+.endm
+
+.macro loadd_ group, index0, index1, base, offset
+ .if offset >= 256 // could remove this check for Thumb builds if they're supported in future
+ ldr \group\index0, [\base, #\offset]
+ ldr \group\index1, [\base, #(\offset) + 4]
+ .else
+ ldrd \group\index0, \group\index1, [\base, #\offset]
+ .endif
+.endm
+
+.macro multiply index, accumulate, long
+ .altmacro
+ multiply_ %(\index), \accumulate, \long
+ .noaltmacro
+.endm
+
+.macro multiply_ index, accumulate, long
+ .if \long
+ .if \accumulate
+ smlal AC0, AC1, CO\index, ST\index
+ .else
+ smull AC0, AC1, CO\index, ST\index
+ .endif
+ .else
+ .if \accumulate
+ mla AC0, CO\index, ST\index, AC0
+ .else
+ mul AC0, CO\index, ST\index
+ .endif
+ .endif
+.endm
+
+// A macro to update the load register number and load offsets
+
+.macro inc howmany
+ .set LOAD_REG, (LOAD_REG + \howmany) & 3
+ .set OFFSET_CO, OFFSET_CO + 4 * \howmany
+ .set OFFSET_ST, OFFSET_ST + 4 * \howmany
+ .if FIR_REMAIN > 0
+ .set FIR_REMAIN, FIR_REMAIN - \howmany
+ .if FIR_REMAIN == 0
+ .set OFFSET_CO, 4 * MAX_FIR_ORDER
+ .set OFFSET_ST, 4 * (MAX_BLOCKSIZE + MAX_FIR_ORDER)
+ .endif
+ .elseif IIR_REMAIN > 0
+ .set IIR_REMAIN, IIR_REMAIN - \howmany
+ .endif
+.endm
+
+// Macro to implement the inner loop for one specific combination of parameters
+
+.macro implement_filter mask_minus1, shift_0, shift_8, iir_taps, fir_taps
+ .set TOTAL_TAPS, \iir_taps + \fir_taps
+
+ // Deal with register allocation...
+ .set DEFINED_SHIFT, 0
+ .set DEFINED_MASK, 0
+ .set SHUFFLE_SHIFT, 0
+ .set SHUFFLE_MASK, 0
+ .set SPILL_SHIFT, 0
+ .set SPILL_MASK, 0
+ .if TOTAL_TAPS == 0
+ // Little register pressure in this case - just keep MASK where it was
+ .if !\mask_minus1
+ MASK .req ST1
+ .set DEFINED_MASK, 1
+ .endif
+ .else
+ .if \shift_0
+ .if !\mask_minus1
+ // AC1 is unused with shift 0
+ MASK .req AC1
+ .set DEFINED_MASK, 1
+ .set SHUFFLE_MASK, 1
+ .endif
+ .elseif \shift_8
+ .if !\mask_minus1
+ .if TOTAL_TAPS <= 4
+ // All coefficients are preloaded (so pointer not needed)
+ MASK .req PCO
+ .set DEFINED_MASK, 1
+ .set SHUFFLE_MASK, 1
+ .else
+ .set SPILL_MASK, 1
+ .endif
+ .endif
+ .else // shift not 0 or 8
+ .if TOTAL_TAPS <= 3
+ // All coefficients are preloaded, and at least one CO register is unused
+ .if \fir_taps & 1
+ SHIFT .req CO0
+ .set DEFINED_SHIFT, 1
+ .set SHUFFLE_SHIFT, 1
+ .else
+ SHIFT .req CO3
+ .set DEFINED_SHIFT, 1
+ .set SHUFFLE_SHIFT, 1
+ .endif
+ .if !\mask_minus1
+ MASK .req PCO
+ .set DEFINED_MASK, 1
+ .set SHUFFLE_MASK, 1
+ .endif
+ .elseif TOTAL_TAPS == 4
+ // All coefficients are preloaded
+ SHIFT .req PCO
+ .set DEFINED_SHIFT, 1
+ .set SHUFFLE_SHIFT, 1
+ .if !\mask_minus1
+ .set SPILL_MASK, 1
+ .endif
+ .else
+ .set SPILL_SHIFT, 1
+ .if !\mask_minus1
+ .set SPILL_MASK, 1
+ .endif
+ .endif
+ .endif
+ .endif
+ .if SPILL_SHIFT
+ SHIFT .req ST0
+ .set DEFINED_SHIFT, 1
+ .endif
+ .if SPILL_MASK
+ MASK .req ST1
+ .set DEFINED_MASK, 1
+ .endif
+
+ // Preload coefficients if possible
+ .if TOTAL_TAPS <= 4
+ .set OFFSET_CO, 0
+ .if \fir_taps & 1
+ .set LOAD_REG, 1
+ .else
+ .set LOAD_REG, 0
+ .endif
+ .rept \fir_taps
+ load CO, LOAD_REG, PCO, OFFSET_CO
+ .set LOAD_REG, (LOAD_REG + 1) & 3
+ .set OFFSET_CO, OFFSET_CO + 4
+ .endr
+ .set OFFSET_CO, 4 * MAX_FIR_ORDER
+ .rept \iir_taps
+ load CO, LOAD_REG, PCO, OFFSET_CO
+ .set LOAD_REG, (LOAD_REG + 1) & 3
+ .set OFFSET_CO, OFFSET_CO + 4
+ .endr
+ .endif
+
+ // Move mask/shift to final positions if necessary
+ // Need to do this after preloading, because in some cases we
+ // reuse the coefficient pointer register
+ .if SHUFFLE_SHIFT
+ mov SHIFT, ST0
+ .endif
+ .if SHUFFLE_MASK
+ mov MASK, ST1
+ .endif
+
+ // Begin loop
+01:
+ .if TOTAL_TAPS == 0
+ // Things simplify a lot in this case
+ // In fact this could be pipelined further if it's worth it...
+ ldr ST0, [PSAMP]
+ subs I, I, #1
+ .if !\mask_minus1
+ and ST0, ST0, MASK
+ .endif
+ str ST0, [PST, #-4]!
+ str ST0, [PST, #4 * (MAX_BLOCKSIZE + MAX_FIR_ORDER)]
+ str ST0, [PSAMP], #4 * MAX_CHANNELS
+ bne 01b
+ .else
+ .if \fir_taps & 1
+ .set LOAD_REG, 1
+ .else
+ .set LOAD_REG, 0
+ .endif
+ .set LOAD_BANK, 0
+ .set FIR_REMAIN, \fir_taps
+ .set IIR_REMAIN, \iir_taps
+ .if FIR_REMAIN == 0 // only IIR terms
+ .set OFFSET_CO, 4 * MAX_FIR_ORDER
+ .set OFFSET_ST, 4 * (MAX_BLOCKSIZE + MAX_FIR_ORDER)
+ .else
+ .set OFFSET_CO, 0
+ .set OFFSET_ST, 0
+ .endif
+ .set MUL_REG, LOAD_REG
+ .set COUNTER, 0
+ .rept TOTAL_TAPS + 2
+ // Do load(s)
+ .if FIR_REMAIN != 0 || IIR_REMAIN != 0
+ .if COUNTER == 0
+ .if TOTAL_TAPS > 4
+ load CO, LOAD_REG, PCO, OFFSET_CO
+ .endif
+ load ST, LOAD_REG, PST, OFFSET_ST
+ inc 1
+ .elseif COUNTER == 1 && (\fir_taps & 1) == 0
+ .if TOTAL_TAPS > 4
+ load CO, LOAD_REG, PCO, OFFSET_CO
+ .endif
+ load ST, LOAD_REG, PST, OFFSET_ST
+ inc 1
+ .elseif LOAD_BANK == 0
+ .if TOTAL_TAPS > 4
+ .if FIR_REMAIN == 0 && IIR_REMAIN == 1
+ load CO, LOAD_REG, PCO, OFFSET_CO
+ .else
+ loadd CO, LOAD_REG, PCO, OFFSET_CO
+ .endif
+ .endif
+ .set LOAD_BANK, 1
+ .else
+ .if FIR_REMAIN == 0 && IIR_REMAIN == 1
+ load ST, LOAD_REG, PST, OFFSET_ST
+ inc 1
+ .else
+ loadd ST, LOAD_REG, PST, OFFSET_ST
+ inc 2
+ .endif
+ .set LOAD_BANK, 0
+ .endif
+ .endif
+
+ // Do interleaved multiplies, slightly delayed
+ .if COUNTER >= 2
+ multiply MUL_REG, COUNTER > 2, !\shift_0
+ .set MUL_REG, (MUL_REG + 1) & 3
+ .endif
+ .set COUNTER, COUNTER + 1
+ .endr
+
+ // Post-process the result of the multiplies
+ .if SPILL_SHIFT
+ ldr SHIFT, [sp, #9*4 + 0*4]
+ .endif
+ .if SPILL_MASK
+ ldr MASK, [sp, #9*4 + 1*4]
+ .endif
+ ldr ST2, [PSAMP]
+ subs I, I, #1
+ .if \shift_8
+ mov AC0, AC0, lsr #8
+ orr AC0, AC0, AC1, lsl #24
+ .elseif !\shift_0
+ rsb ST3, SHIFT, #32
+ mov AC0, AC0, lsr SHIFT
+ orr AC0, AC0, AC1, lsl ST3
+ .endif
+ .if \mask_minus1
+ add ST3, ST2, AC0
+ .else
+ add ST2, ST2, AC0
+ and ST3, ST2, MASK
+ sub ST2, ST3, AC0
+ .endif
+ str ST3, [PST, #-4]!
+ str ST2, [PST, #4 * (MAX_BLOCKSIZE + MAX_FIR_ORDER)]
+ str ST3, [PSAMP], #4 * MAX_CHANNELS
+ bne 01b
+ .endif
+ b 99f
+
+ .if DEFINED_SHIFT
+ .unreq SHIFT
+ .endif
+ .if DEFINED_MASK
+ .unreq MASK
+ .endif
+.endm
+
+.macro switch_on_fir_taps mask_minus1, shift_0, shift_8, iir_taps
+ ldr pc, [pc, a3, LSL #2] // firorder is in range 0-(8-iir_taps)
+ .word 0
+ .word 70f
+ .word 71f
+ .word 72f
+ .word 73f
+ .word 74f
+ .if \iir_taps <= 3
+ .word 75f
+ .if \iir_taps <= 2
+ .word 76f
+ .if \iir_taps <= 1
+ .word 77f
+ .if \iir_taps == 0
+ .word 78f
+ .endif
+ .endif
+ .endif
+ .endif
+70: implement_filter \mask_minus1, \shift_0, \shift_8, \iir_taps, 0
+71: implement_filter \mask_minus1, \shift_0, \shift_8, \iir_taps, 1
+72: implement_filter \mask_minus1, \shift_0, \shift_8, \iir_taps, 2
+73: implement_filter \mask_minus1, \shift_0, \shift_8, \iir_taps, 3
+74: implement_filter \mask_minus1, \shift_0, \shift_8, \iir_taps, 4
+ .if \iir_taps <= 3
+75: implement_filter \mask_minus1, \shift_0, \shift_8, \iir_taps, 5
+ .if \iir_taps <= 2
+76: implement_filter \mask_minus1, \shift_0, \shift_8, \iir_taps, 6
+ .if \iir_taps <= 1
+77: implement_filter \mask_minus1, \shift_0, \shift_8, \iir_taps, 7
+ .if \iir_taps == 0
+78: implement_filter \mask_minus1, \shift_0, \shift_8, \iir_taps, 8
+ .endif
+ .endif
+ .endif
+ .endif
+.endm
+
+.macro switch_on_iir_taps mask_minus1, shift_0, shift_8
+ ldr pc, [pc, a4, LSL #2] // irorder is in range 0-4
+ .word 0
+ .word 60f
+ .word 61f
+ .word 62f
+ .word 63f
+ .word 64f
+60: switch_on_fir_taps \mask_minus1, \shift_0, \shift_8, 0
+61: switch_on_fir_taps \mask_minus1, \shift_0, \shift_8, 1
+62: switch_on_fir_taps \mask_minus1, \shift_0, \shift_8, 2
+63: switch_on_fir_taps \mask_minus1, \shift_0, \shift_8, 3
+64: switch_on_fir_taps \mask_minus1, \shift_0, \shift_8, 4
+.endm
+
+/* void ff_mlp_filter_channel_arm(int32_t *state, const int32_t *coeff,
+ * int firorder, int iirorder,
+ * unsigned int filter_shift, int32_t mask,
+ * int blocksize, int32_t *sample_buffer);
+ */
+function ff_mlp_filter_channel_arm, export=1
+ push {v1-fp,lr}
+ add v1, sp, #9*4 // point at arguments on stack
+ ldm v1, {ST0,ST1,I,PSAMP}
+ cmp ST1, #-1
+ bne 30f
+ movs ST2, ST0, lsl #29 // shift is in range 0-15; we want to special-case 0 and 8
+ bne 20f
+ bcs 10f
+ switch_on_iir_taps 1, 1, 0
+10: switch_on_iir_taps 1, 0, 1
+20: switch_on_iir_taps 1, 0, 0
+30: movs ST2, ST0, lsl #29 // shift is in range 0-15; we want to special-case 0 and 8
+ bne 50f
+ bcs 40f
+ switch_on_iir_taps 0, 1, 0
+40: switch_on_iir_taps 0, 0, 1
+50: switch_on_iir_taps 0, 0, 0
+99: pop {v1-fp,pc}
+endfunc
+
+ .unreq PST
+ .unreq PCO
+ .unreq AC0
+ .unreq AC1
+ .unreq CO0
+ .unreq CO1
+ .unreq CO2
+ .unreq CO3
+ .unreq ST0
+ .unreq ST1
+ .unreq ST2
+ .unreq ST3
+ .unreq I
+ .unreq PSAMP
+
diff --git a/libavcodec/arm/mlpdsp_init_arm.c b/libavcodec/arm/mlpdsp_init_arm.c
new file mode 100644
index 0000000..f0ea285
--- /dev/null
+++ b/libavcodec/arm/mlpdsp_init_arm.c
@@ -0,0 +1,36 @@
+/*
+ * Copyright (c) 2014 RISC OS Open Ltd
+ * Author: Ben Avison <***@riscosopen.org>
+ *
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdint.h>
+
+#include "libavutil/arm/cpu.h"
+#include "libavutil/attributes.h"
+#include "libavcodec/mlpdsp.h"
+
+void ff_mlp_filter_channel_arm(int32_t *state, const int32_t *coeff,
+ int firorder, int iirorder,
+ unsigned int filter_shift, int32_t mask,
+ int blocksize, int32_t *sample_buffer);
+
+av_cold void ff_mlpdsp_init_arm(MLPDSPContext *c)
+{
+ c->mlp_filter_channel = ff_mlp_filter_channel_arm;
+}
diff --git a/libavcodec/mlpdsp.c b/libavcodec/mlpdsp.c
index b96200e..151cf83 100644
--- a/libavcodec/mlpdsp.c
+++ b/libavcodec/mlpdsp.c
@@ -60,6 +60,8 @@ static void mlp_filter_channel(int32_t *state, const int32_t *coeff,
av_cold void ff_mlpdsp_init(MLPDSPContext *c)
{
c->mlp_filter_channel = mlp_filter_channel;
+ if (ARCH_ARM)
+ ff_mlpdsp_init_arm(c);
if (ARCH_X86)
ff_mlpdsp_init_x86(c);
}
diff --git a/libavcodec/mlpdsp.h b/libavcodec/mlpdsp.h
index 995f72a..c985a17 100644
--- a/libavcodec/mlpdsp.h
+++ b/libavcodec/mlpdsp.h
@@ -32,6 +32,7 @@ typedef struct MLPDSPContext {
} MLPDSPContext;

void ff_mlpdsp_init(MLPDSPContext *c);
+void ff_mlpdsp_init_arm(MLPDSPContext *c);
void ff_mlpdsp_init_x86(MLPDSPContext *c);

#endif /* AVCODEC_MLPDSP_H */
--
1.7.5.4
Martin Storsjö
2014-03-20 07:33:10 UTC
Permalink
Post by Ben Avison
Profiling results for overall audio decode and the mlp_filter_channel(_arm)
Before After
Mean StdDev Mean StdDev Confidence Change
6:2 total 380.4 22.0 370.8 17.0 87.4% +2.6% (insignificant)
6:2 function 60.7 7.2 36.6 8.1 100.0% +65.8%
8:2 total 357.0 17.5 343.2 19.0 97.8% +4.0% (insignificant)
8:2 function 60.3 8.8 37.3 3.8 100.0% +61.8%
6:6 total 717.2 23.2 658.4 15.7 100.0% +8.9%
6:6 function 140.4 12.9 81.5 9.2 100.0% +72.4%
8:8 total 981.9 16.2 896.2 24.5 100.0% +9.6%
8:8 function 193.4 15.0 103.3 11.5 100.0% +87.2%
Experiments with adding preload instructions to this function yielded no
useful benefit, so these have not been included.
The assembly version has also been tested with a fuzz tester to ensure that
any combinations of inputs not exercised by my available test streams still
generate mathematically identical results to the C version.
---
libavcodec/arm/Makefile | 2 +
libavcodec/arm/mlpdsp_arm.S | 435 ++++++++++++++++++++++++++++++++++++++
libavcodec/arm/mlpdsp_init_arm.c | 36 +++
libavcodec/mlpdsp.c | 2 +
libavcodec/mlpdsp.h | 1 +
5 files changed, 476 insertions(+), 0 deletions(-)
create mode 100644 libavcodec/arm/mlpdsp_arm.S
create mode 100644 libavcodec/arm/mlpdsp_init_arm.c
diff --git a/libavcodec/arm/Makefile b/libavcodec/arm/Makefile
index 8bdccbd..c6cc96e 100644
--- a/libavcodec/arm/Makefile
+++ b/libavcodec/arm/Makefile
@@ -21,6 +21,8 @@ OBJS-$(CONFIG_H264PRED) += arm/h264pred_init_arm.o
OBJS-$(CONFIG_H264QPEL) += arm/h264qpel_init_arm.o
OBJS-$(CONFIG_HPELDSP) += arm/hpeldsp_init_arm.o \
arm/hpeldsp_arm.o
+OBJS-$(CONFIG_MLP_DECODER) += arm/mlpdsp_init_arm.o \
+ arm/mlpdsp_arm.o
OBJS-$(CONFIG_MPEGAUDIODSP) += arm/mpegaudiodsp_init_arm.o
OBJS-$(CONFIG_MPEGVIDEO) += arm/mpegvideo_arm.o
OBJS-$(CONFIG_NEON_CLOBBER_TEST) += arm/neontest.o
diff --git a/libavcodec/arm/mlpdsp_arm.S b/libavcodec/arm/mlpdsp_arm.S
new file mode 100644
index 0000000..9e0bf57
--- /dev/null
+++ b/libavcodec/arm/mlpdsp_arm.S
@@ -0,0 +1,435 @@
+/*
+ * Copyright (c) 2014 RISC OS Open Ltd
+ *
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/arm/asm.S"
+
+// This code uses too many ARM-only tricks to easily assemble as Thumb
+.arm
Just to be clear, the tricks that don't work in thumb mode are
non-constant shifts, and jump tables with "ldr pc, [pc, ...]", right?

Forcing arm mode like this isn't ok in all configurations - e.g. when
building for WinRT/Windows Phone 8, you really have to build all of it in
thumb mode; the linker doesn't handle everything needed for mixing the
modes there.

Would it be acceptable to build and run this code only if CONFIG_THUMB is
disabled? That's the case for most raspberry pi builds at least, although
I guess it would lead to not using this code at all on other e.g. armv7
builds on linux where it still could have been beneficial?

// Martin
Janne Grunau
2014-03-20 10:06:41 UTC
Permalink
Post by Martin Storsjö
Post by Ben Avison
diff --git a/libavcodec/arm/mlpdsp_arm.S b/libavcodec/arm/mlpdsp_arm.S
new file mode 100644
index 0000000..9e0bf57
--- /dev/null
+++ b/libavcodec/arm/mlpdsp_arm.S
@@ -0,0 +1,435 @@
+/*
+
+#include "libavutil/arm/asm.S"
+
+// This code uses too many ARM-only tricks to easily assemble as Thumb
+.arm
Just to be clear, the tricks that don't work in thumb mode are
non-constant shifts, and jump tables with "ldr pc, [pc, ...]",
right?
Forcing arm mode like this isn't ok in all configurations - e.g.
when building for WinRT/Windows Phone 8, you really have to build
all of it in thumb mode; the linker doesn't handle everything needed
for mixing the modes there.
Would it be acceptable to build and run this code only if
CONFIG_THUMB is disabled? That's the case for most raspberry pi
builds at least, although I guess it would lead to not using this
code at all on other e.g. armv7 builds on linux where it still could
have been beneficial?
We could add CONFIG_THUMB_INTERWORKING and disable that for
WinRT/Windows Phone 8.

Janne
Martin Storsjö
2014-03-20 12:27:18 UTC
Permalink
Post by Janne Grunau
Post by Martin Storsjö
Post by Ben Avison
diff --git a/libavcodec/arm/mlpdsp_arm.S b/libavcodec/arm/mlpdsp_arm.S
new file mode 100644
index 0000000..9e0bf57
--- /dev/null
+++ b/libavcodec/arm/mlpdsp_arm.S
@@ -0,0 +1,435 @@
+/*
+
+#include "libavutil/arm/asm.S"
+
+// This code uses too many ARM-only tricks to easily assemble as Thumb
+.arm
Just to be clear, the tricks that don't work in thumb mode are
non-constant shifts, and jump tables with "ldr pc, [pc, ...]", right?
Forcing arm mode like this isn't ok in all configurations - e.g.
when building for WinRT/Windows Phone 8, you really have to build
all of it in thumb mode; the linker doesn't handle everything needed
for mixing the modes there.
Would it be acceptable to build and run this code only if
CONFIG_THUMB is disabled? That's the case for most raspberry pi
builds at least, although I guess it would lead to not using this
code at all on other e.g. armv7 builds on linux where it still could
have been beneficial?
We could add CONFIG_THUMB_INTERWORKING and disable that for
WinRT/Windows Phone 8.
Sure, that could also work.

// Martin
Ben Avison
2014-03-20 18:57:40 UTC
Permalink
Just to be clear, the tricks that don't work in thumb mode are non-
constant shifts, and jump tables with "ldr pc, [pc, ...]", right?
Yes, it looks like it. I admit, Thumb was something of an afterthought;
shortly before I released it I had a try at assembling it as Thumb and
saw multiple issues; switching to using ARM was the quick and easy
solution. Or so I thought...
Forcing arm mode like this isn't ok in all configurations - e.g. when
building for WinRT/Windows Phone 8, you really have to build all of it
in thumb mode; the linker doesn't handle everything needed for mixing
the modes there.
I would consider that a tools bug, myself. There's no architectural
reason why interworking wouldn't work on anything except a Cortex-M CPU
(which isn't the sort of thing you'd run libav on, especially a TrueHD
decoder - far too low power). My assembly functions are called through
function pointers, not BL statements, and ever since ARMv5 all loads to
the PC (which is how you use a function pointer) are treated as
interworking branches. Before ARMv5, the Thumb instruction set wouldn't
have been rich enough to assemble the sources anyway.

However, I doubt I'll have much luck persuading Microsoft of the error of
their ways, so I have made the following changes:
* added IT statements where necessary
* replaced the branch tables with TBH branch tables when targeting Thumb
* where possible, replaced the ALU-op-with-register-shifted-register
instructions with Thumb-compatible sequences when targeting Thumb

It wasn't possible to convert one family of functions, though - the 18
ff_mlp_pack_output_outoforder_*ch_*shift_armv6 functions. There were no
spare registers which could be used as temporaries there, so supporting
Thumb would mean a major change to register allocation, which I haven't
attempted. Instead, I have switched out those functions when CONFIG_THUMB
is defined. (Yes, I saw the discussion about adding an interworking
switch - I'm not sure it's warranted now that most of the optimisations
work in Thumb mode.)

Ben
Martin Storsjö
2014-03-20 20:51:54 UTC
Permalink
Post by Ben Avison
Just to be clear, the tricks that don't work in thumb mode are non-
constant shifts, and jump tables with "ldr pc, [pc, ...]", right?
Yes, it looks like it. I admit, Thumb was something of an afterthought;
shortly before I released it I had a try at assembling it as Thumb and
saw multiple issues; switching to using ARM was the quick and easy
solution. Or so I thought...
Forcing arm mode like this isn't ok in all configurations - e.g. when
building for WinRT/Windows Phone 8, you really have to build all of it
in thumb mode; the linker doesn't handle everything needed for mixing
the modes there.
I would consider that a tools bug, myself. There's no architectural
reason why interworking wouldn't work on anything except a Cortex-M CPU
(which isn't the sort of thing you'd run libav on, especially a TrueHD
decoder - far too low power). My assembly functions are called through
function pointers, not BL statements, and ever since ARMv5 all loads to
the PC (which is how you use a function pointer) are treated as
interworking branches. Before ARMv5, the Thumb instruction set wouldn't
have been rich enough to assemble the sources anyway.
Yeah, architecturally there shouldn't be any issue, but they've more or
less speced their environment to be thumb-only and that's all the tools
care about. I'm not familiar enough with all the intricacies with
interworking to figure out in which cases it works and what's missing in
practice - I've only tested it lightly and got the general picture of
"some things might work, but not everything".
Post by Ben Avison
However, I doubt I'll have much luck persuading Microsoft of the error of
* added IT statements where necessary
* replaced the branch tables with TBH branch tables when targeting Thumb
* where possible, replaced the ALU-op-with-register-shifted-register
instructions with Thumb-compatible sequences when targeting Thumb
It wasn't possible to convert one family of functions, though - the 18
ff_mlp_pack_output_outoforder_*ch_*shift_armv6 functions. There were no
spare registers which could be used as temporaries there, so supporting
Thumb would mean a major change to register allocation, which I haven't
attempted. Instead, I have switched out those functions when CONFIG_THUMB
is defined. (Yes, I saw the discussion about adding an interworking
switch - I'm not sure it's warranted now that most of the optimisations
work in Thumb mode.)
This sounds like a quite sensible thing to do.

Now it remains for me to fix up gas-preprocessor to the point that this
assembles correctly via that tool as well. As far as I've seen now with
the previous iterations, there's one issue about in which order the nested
ifs/macros are evaluated that need to be tweaked.

// Martin
Martin Storsjö
2014-03-24 12:08:45 UTC
Permalink
Post by Martin Storsjö
Now it remains for me to fix up gas-preprocessor to the point that this
assembles correctly via that tool as well. As far as I've seen now with the
previous iterations, there's one issue about in which order the nested
ifs/macros are evaluated that need to be tweaked.
There turned out to be a bit more than just one issue to fix to make this
work, but it seems to work ok for me now - I'll post the gas-preprocessor
patches and once they're in this shouldn't be any technical obstacle for
merging this any longer.

I've got a few minor details that need to be fixed in your patches though,
but nothing major.

// Martin
Ben Avison
2014-03-20 00:48:07 UTC
Permalink
Verified with profiling that this doesn't have a measurable effect upon
overall performance.
---
libavcodec/mlpdec.c | 37 ++++++++++++-------------------------
libavcodec/mlpdsp.c | 33 +++++++++++++++++++++++++++++++++
libavcodec/mlpdsp.h | 23 +++++++++++++++++++++++
3 files changed, 68 insertions(+), 25 deletions(-)

diff --git a/libavcodec/mlpdec.c b/libavcodec/mlpdec.c
index ed5a6ac..c0f2d6a 100644
--- a/libavcodec/mlpdec.c
+++ b/libavcodec/mlpdec.c
@@ -978,7 +978,7 @@ static void fill_noise_buffer(MLPDecodeContext *m, unsigned int substr)
static void rematrix_channels(MLPDecodeContext *m, unsigned int substr)
{
SubStream *s = &m->substream[substr];
- unsigned int mat, src_ch, i;
+ unsigned int mat;
unsigned int maxchan;

maxchan = s->max_matrix_channel;
@@ -990,31 +990,18 @@ static void rematrix_channels(MLPDecodeContext *m, unsigned int substr)
}

for (mat = 0; mat < s->num_primitive_matrices; mat++) {
- int matrix_noise_shift = s->matrix_noise_shift[mat];
unsigned int dest_ch = s->matrix_out_ch[mat];
- int32_t mask = MSB_MASK(s->quant_step_size[dest_ch]);
- int32_t *coeffs = s->matrix_coeff[mat];
- int index = s->num_primitive_matrices - mat;
- int index2 = 2 * index + 1;
-
- /* TODO: DSPContext? */
-
- for (i = 0; i < s->blockpos; i++) {
- int32_t bypassed_lsb = m->bypassed_lsbs[i][mat];
- int32_t *samples = m->sample_buffer[i];
- int64_t accum = 0;
-
- for (src_ch = 0; src_ch <= maxchan; src_ch++)
- accum += (int64_t) samples[src_ch] * coeffs[src_ch];
-
- if (matrix_noise_shift) {
- index &= m->access_unit_size_pow2 - 1;
- accum += m->noise_buffer[index] << (matrix_noise_shift + 7);
- index += index2;
- }
-
- samples[dest_ch] = ((accum >> 14) & mask) + bypassed_lsb;
- }
+ m->dsp.mlp_rematrix_channel(&m->sample_buffer[0][0],
+ s->matrix_coeff[mat],
+ &m->bypassed_lsbs[0][mat],
+ m->noise_buffer,
+ s->num_primitive_matrices - mat,
+ dest_ch,
+ s->blockpos,
+ maxchan,
+ s->matrix_noise_shift[mat],
+ m->access_unit_size_pow2,
+ MSB_MASK(s->quant_step_size[dest_ch]));
}
}

diff --git a/libavcodec/mlpdsp.c b/libavcodec/mlpdsp.c
index 151cf83..dfa13af 100644
--- a/libavcodec/mlpdsp.c
+++ b/libavcodec/mlpdsp.c
@@ -57,9 +57,42 @@ static void mlp_filter_channel(int32_t *state, const int32_t *coeff,
}
}

+void ff_mlp_rematrix_channel(int32_t *samples,
+ const int32_t *coeffs,
+ const uint8_t *bypassed_lsbs,
+ const int8_t *noise_buffer,
+ int index,
+ unsigned int dest_ch,
+ uint16_t blockpos,
+ unsigned int maxchan,
+ int matrix_noise_shift,
+ int access_unit_size_pow2,
+ int32_t mask)
+{
+ unsigned int src_ch, i;
+ int index2 = 2 * index + 1;
+ for (i = 0; i < blockpos; i++) {
+ int64_t accum = 0;
+
+ for (src_ch = 0; src_ch <= maxchan; src_ch++)
+ accum += (int64_t) samples[src_ch] * coeffs[src_ch];
+
+ if (matrix_noise_shift) {
+ index &= access_unit_size_pow2 - 1;
+ accum += noise_buffer[index] << (matrix_noise_shift + 7);
+ index += index2;
+ }
+
+ samples[dest_ch] = ((accum >> 14) & mask) + *bypassed_lsbs;
+ bypassed_lsbs += MAX_CHANNELS;
+ samples += MAX_CHANNELS;
+ }
+}
+
av_cold void ff_mlpdsp_init(MLPDSPContext *c)
{
c->mlp_filter_channel = mlp_filter_channel;
+ c->mlp_rematrix_channel = ff_mlp_rematrix_channel;
if (ARCH_ARM)
ff_mlpdsp_init_arm(c);
if (ARCH_X86)
diff --git a/libavcodec/mlpdsp.h b/libavcodec/mlpdsp.h
index c985a17..bd864d9 100644
--- a/libavcodec/mlpdsp.h
+++ b/libavcodec/mlpdsp.h
@@ -24,11 +24,34 @@

#include <stdint.h>

+void ff_mlp_rematrix_channel(int32_t *samples,
+ const int32_t *coeffs,
+ const uint8_t *bypassed_lsbs,
+ const int8_t *noise_buffer,
+ int index,
+ unsigned int dest_ch,
+ uint16_t blockpos,
+ unsigned int maxchan,
+ int matrix_noise_shift,
+ int access_unit_size_pow2,
+ int32_t mask);
+
typedef struct MLPDSPContext {
void (*mlp_filter_channel)(int32_t *state, const int32_t *coeff,
int firorder, int iirorder,
unsigned int filter_shift, int32_t mask,
int blocksize, int32_t *sample_buffer);
+ void (*mlp_rematrix_channel)(int32_t *samples,
+ const int32_t *coeffs,
+ const uint8_t *bypassed_lsbs,
+ const int8_t *noise_buffer,
+ int index,
+ unsigned int dest_ch,
+ uint16_t blockpos,
+ unsigned int maxchan,
+ int matrix_noise_shift,
+ int access_unit_size_pow2,
+ int32_t mask);
} MLPDSPContext;

void ff_mlpdsp_init(MLPDSPContext *c);
--
1.7.5.4
Ben Avison
2014-03-20 00:48:08 UTC
Permalink
Profiling results for overall audio decode and the rematrix_channels function
in particular are as follows:

Before After
Mean StdDev Mean StdDev Confidence Change
6:2 total 370.8 17.0 348.8 20.1 99.9% +6.3%
6:2 function 46.4 8.4 45.8 6.6 18.0% +1.2% (insignificant)
8:2 total 343.2 19.0 339.1 15.4 54.7% +1.2% (insignificant)
8:2 function 38.9 3.9 40.2 6.9 52.4% -3.2% (insignificant)
6:6 total 658.4 15.7 604.6 20.8 100.0% +8.9%
6:6 function 109.0 8.7 59.5 5.4 100.0% +83.3%
8:8 total 896.2 24.5 766.4 17.6 100.0% +16.9%
8:8 function 223.4 12.8 93.8 5.0 100.0% +138.3%

The assembly version has also been tested with a fuzz tester to ensure that
any combinations of inputs not exercised by my available test streams still
generate mathematically identical results to the C version.
---
libavcodec/arm/mlpdsp_arm.S | 230 ++++++++++++++++++++++++++++++++++++++
libavcodec/arm/mlpdsp_init_arm.c | 12 ++
2 files changed, 242 insertions(+), 0 deletions(-)

diff --git a/libavcodec/arm/mlpdsp_arm.S b/libavcodec/arm/mlpdsp_arm.S
index 9e0bf57..23d27e5 100644
--- a/libavcodec/arm/mlpdsp_arm.S
+++ b/libavcodec/arm/mlpdsp_arm.S
@@ -433,3 +433,233 @@ endfunc
.unreq I
.unreq PSAMP

+/********************************************************************/
+
+PSA .req a1 // samples
+PCO .req a2 // coeffs
+PBL .req a3 // bypassed_lsbs
+INDEX .req a4
+CO0 .req v1
+CO1 .req v2
+CO2 .req v3
+CO3 .req v4
+SA0 .req v5
+SA1 .req v6
+SA2 .req sl
+SA3 .req fp
+AC0 .req ip
+AC1 .req lr
+NOISE .req SA0
+LSB .req SA1
+DCH .req SA2 // dest_ch
+MASK .req SA3
+
+ // INDEX is used as follows:
+ // bits 0..6 index2 (values up to 17, but wider so that we can
+ // add to index field without needing to mask)
+ // bits 7..14 i (values up to 160)
+ // bit 15 underflow detect for i
+ // bits 25..31 (if access_unit_size_pow2 == 128) \ index
+ // bits 26..31 (if access_unit_size_pow2 == 64) /
+
+.macro implement_rematrix shift, index_mask, mask_minus1, maxchan
+ .if \maxchan == 1
+ // We can just leave the coefficients in registers in this case
+ ldrd CO0, CO1, [PCO]
+ .endif
+1:
+ .if \maxchan == 1
+ ldrd SA0, SA1, [PSA]
+ smull AC0, AC1, CO0, SA0
+ .elseif \maxchan == 5
+ ldr CO0, [PCO, #0]
+ ldr SA0, [PSA, #0]
+ ldr CO1, [PCO, #4]
+ ldr SA1, [PSA, #4]
+ ldrd CO2, CO3, [PCO, #8]
+ smull AC0, AC1, CO0, SA0
+ ldrd SA2, SA3, [PSA, #8]
+ smlal AC0, AC1, CO1, SA1
+ ldrd CO0, CO1, [PCO, #16]
+ smlal AC0, AC1, CO2, SA2
+ ldrd SA0, SA1, [PSA, #16]
+ smlal AC0, AC1, CO3, SA3
+ smlal AC0, AC1, CO0, SA0
+ .else // \maxchan == 7
+ ldr CO2, [PCO, #0]
+ ldr SA2, [PSA, #0]
+ ldr CO3, [PCO, #4]
+ ldr SA3, [PSA, #4]
+ ldrd CO0, CO1, [PCO, #8]
+ smull AC0, AC1, CO2, SA2
+ ldrd SA0, SA1, [PSA, #8]
+ smlal AC0, AC1, CO3, SA3
+ ldrd CO2, CO3, [PCO, #16]
+ smlal AC0, AC1, CO0, SA0
+ ldrd SA2, SA3, [PSA, #16]
+ smlal AC0, AC1, CO1, SA1
+ ldrd CO0, CO1, [PCO, #24]
+ smlal AC0, AC1, CO2, SA2
+ ldrd SA0, SA1, [PSA, #24]
+ smlal AC0, AC1, CO3, SA3
+ smlal AC0, AC1, CO0, SA0
+ .endif
+ ldm sp, {NOISE, DCH, MASK}
+ smlal AC0, AC1, CO1, SA1
+ .if \shift != 0
+ .if \index_mask == 63
+ add NOISE, NOISE, INDEX, lsr #32-6
+ ldrb LSB, [PBL], #MAX_CHANNELS
+ ldrsb NOISE, [NOISE]
+ add INDEX, INDEX, INDEX, lsl #32-6
+ .else // \index_mask == 127
+ add NOISE, NOISE, INDEX, lsr #32-7
+ ldrb LSB, [PBL], #MAX_CHANNELS
+ ldrsb NOISE, [NOISE]
+ add INDEX, INDEX, INDEX, lsl #32-7
+ .endif
+ sub INDEX, INDEX, #1<<7
+ adds AC0, AC0, NOISE, lsl #\shift + 7
+ adc AC1, AC1, NOISE, asr #31
+ .else
+ ldrb LSB, [PBL], #MAX_CHANNELS
+ sub INDEX, INDEX, #1<<7
+ .endif
+ add PSA, PSA, #MAX_CHANNELS*4
+ mov AC0, AC0, lsr #14
+ orr AC0, AC0, AC1, lsl #18
+ .if !\mask_minus1
+ and AC0, AC0, MASK
+ .endif
+ add AC0, AC0, LSB
+ tst INDEX, #1<<15
+ str AC0, [PSA, DCH, lsl #2] // DCH is precompensated for the early increment of PSA
+ beq 1b
+ b 98f
+.endm
+
+.macro switch_on_maxchan shift, index_mask, mask_minus1
+ cmp v4, #5
+ blo 51f
+ beq 50f
+ implement_rematrix \shift, \index_mask, \mask_minus1, 7
+50: implement_rematrix \shift, \index_mask, \mask_minus1, 5
+51: implement_rematrix \shift, \index_mask, \mask_minus1, 1
+.endm
+
+.macro switch_on_mask shift, index_mask
+ cmp sl, #-1
+ bne 40f
+ switch_on_maxchan \shift, \index_mask, 1
+40: switch_on_maxchan \shift, \index_mask, 0
+.endm
+
+.macro switch_on_au_size shift
+ .if \shift == 0
+ switch_on_mask \shift, undefined
+ .else
+ teq v6, #64
+ bne 30f
+ orr INDEX, INDEX, v1, lsl #32-6
+ switch_on_mask \shift, 63
+30: orr INDEX, INDEX, v1, lsl #32-7
+ switch_on_mask \shift, 127
+ .endif
+.endm
+
+/* void ff_mlp_rematrix_channel_arm(int32_t *samples,
+ * const int32_t *coeffs,
+ * const uint8_t *bypassed_lsbs,
+ * const int8_t *noise_buffer,
+ * int index,
+ * unsigned int dest_ch,
+ * uint16_t blockpos,
+ * unsigned int maxchan,
+ * int matrix_noise_shift,
+ * int access_unit_size_pow2,
+ * int32_t mask);
+ */
+function ff_mlp_rematrix_channel_arm, export=1
+ push {v1-fp,lr}
+ add v1, sp, #9*4 // point at arguments on stack
+ ldm v1, {v1-sl}
+ teq v4, #1
+ teqne v4, #5
+ teqne v4, #7
+ bne 99f
+ teq v6, #64
+ teqne v6, #128
+ bne 99f
+ sub v2, v2, #MAX_CHANNELS
+ push {a4,v2,sl} // initialise NOISE,DCH,MASK; make sp dword-aligned
+ movs INDEX, v3, lsl #7
+ beq 98f // just in case, do nothing if blockpos = 0
+ subs INDEX, INDEX, #1<<7 // offset by 1 so we borrow at the right time
+ adc lr, v1, v1 // calculate index2 (C was set by preceding subs)
+ orr INDEX, INDEX, lr
+ // Switch on matrix_noise_shift: values 0 and 1 are
+ // disproportionately common so do those in a form the branch
+ // predictor can accelerate. Values can only go up to 15.
+ cmp v5, #1
+ beq 11f
+ blo 10f
+ ldr pc, [pc, v5, lsl #2]
+ .word 0
+ .word 0
+ .word 0
+ .word 12f
+ .word 13f
+ .word 14f
+ .word 15f
+ .word 16f
+ .word 17f
+ .word 18f
+ .word 19f
+ .word 20f
+ .word 21f
+ .word 22f
+ .word 23f
+ .word 24f
+ .word 25f
+10: switch_on_au_size 0
+11: switch_on_au_size 1
+12: switch_on_au_size 2
+13: switch_on_au_size 3
+14: switch_on_au_size 4
+15: switch_on_au_size 5
+16: switch_on_au_size 6
+17: switch_on_au_size 7
+18: switch_on_au_size 8
+19: switch_on_au_size 9
+20: switch_on_au_size 10
+21: switch_on_au_size 11
+22: switch_on_au_size 12
+23: switch_on_au_size 13
+24: switch_on_au_size 14
+25: switch_on_au_size 15
+
+98: add sp, sp, #3*4
+ pop {v1-fp,pc}
+99: // Can't handle these parameters, drop back to C
+ pop {v1-fp,lr}
+ b X(ff_mlp_rematrix_channel)
+endfunc
+
+ .unreq PSA
+ .unreq PCO
+ .unreq PBL
+ .unreq INDEX
+ .unreq CO0
+ .unreq CO1
+ .unreq CO2
+ .unreq CO3
+ .unreq SA0
+ .unreq SA1
+ .unreq SA2
+ .unreq SA3
+ .unreq AC0
+ .unreq AC1
+ .unreq NOISE
+ .unreq LSB
+ .unreq DCH
+ .unreq MASK
diff --git a/libavcodec/arm/mlpdsp_init_arm.c b/libavcodec/arm/mlpdsp_init_arm.c
index f0ea285..268dfdd 100644
--- a/libavcodec/arm/mlpdsp_init_arm.c
+++ b/libavcodec/arm/mlpdsp_init_arm.c
@@ -29,8 +29,20 @@ void ff_mlp_filter_channel_arm(int32_t *state, const int32_t *coeff,
int firorder, int iirorder,
unsigned int filter_shift, int32_t mask,
int blocksize, int32_t *sample_buffer);
+void ff_mlp_rematrix_channel_arm(int32_t *samples,
+ const int32_t *coeffs,
+ const uint8_t *bypassed_lsbs,
+ const int8_t *noise_buffer,
+ int index,
+ unsigned int dest_ch,
+ uint16_t blockpos,
+ unsigned int maxchan,
+ int matrix_noise_shift,
+ int access_unit_size_pow2,
+ int32_t mask);

av_cold void ff_mlpdsp_init_arm(MLPDSPContext *c)
{
c->mlp_filter_channel = ff_mlp_filter_channel_arm;
+ c->mlp_rematrix_channel = ff_mlp_rematrix_channel_arm;
}
--
1.7.5.4
Ben Avison
2014-03-20 00:48:09 UTC
Permalink
Profiling on a Raspberry Pi revealed the best performance to correspond
with VLC_BITS = 5. Results for overall audio decode and the get_vlc2 function
in particular are as follows:

Before After
Mean StdDev Mean StdDev Confidence Change
6:2 total 348.8 20.1 339.6 15.1 88.8% +2.7% (insignificant)
6:2 function 38.1 8.1 26.4 4.1 100.0% +44.5%
8:2 total 339.1 15.4 324.5 15.5 99.4% +4.5%
8:2 function 33.8 7.0 27.3 5.6 99.7% +23.6%
6:6 total 604.6 20.8 572.8 20.6 100.0% +5.6%
6:6 function 95.8 8.4 68.9 8.2 100.0% +39.1%
8:8 total 766.4 17.6 741.5 21.2 100.0% +3.4%
8:8 function 106.0 11.4 86.1 9.9 100.0% +23.1%
---
libavcodec/mlpdec.c | 13 ++++++++++---
1 files changed, 10 insertions(+), 3 deletions(-)

diff --git a/libavcodec/mlpdec.c b/libavcodec/mlpdec.c
index c0f2d6a..b9d1704 100644
--- a/libavcodec/mlpdec.c
+++ b/libavcodec/mlpdec.c
@@ -37,9 +37,16 @@
#include "mlp_parser.h"
#include "mlpdsp.h"
#include "mlp.h"
+#include "config.h"

/** number of bits used for VLC lookup - longest Huffman code is 9 */
+#if ARCH_ARM == 1
+#define VLC_BITS 5
+#define VLC_STATIC_SIZE 64
+#else
#define VLC_BITS 9
+#define VLC_STATIC_SIZE 512
+#endif

typedef struct SubStream {
/// Set if a valid restart header has been read. Otherwise the substream cannot be decoded.
@@ -190,13 +197,13 @@ static av_cold void init_static(void)
if (!huff_vlc[0].bits) {
INIT_VLC_STATIC(&huff_vlc[0], VLC_BITS, 18,
&ff_mlp_huffman_tables[0][0][1], 2, 1,
- &ff_mlp_huffman_tables[0][0][0], 2, 1, 512);
+ &ff_mlp_huffman_tables[0][0][0], 2, 1, VLC_STATIC_SIZE);
INIT_VLC_STATIC(&huff_vlc[1], VLC_BITS, 16,
&ff_mlp_huffman_tables[1][0][1], 2, 1,
- &ff_mlp_huffman_tables[1][0][0], 2, 1, 512);
+ &ff_mlp_huffman_tables[1][0][0], 2, 1, VLC_STATIC_SIZE);
INIT_VLC_STATIC(&huff_vlc[2], VLC_BITS, 15,
&ff_mlp_huffman_tables[2][0][1], 2, 1,
- &ff_mlp_huffman_tables[2][0][0], 2, 1, 512);
+ &ff_mlp_huffman_tables[2][0][0], 2, 1, VLC_STATIC_SIZE);
}

ff_mlp_init_crc();
--
1.7.5.4
Ben Avison
2014-03-20 00:48:10 UTC
Permalink
Verified with profiling that this doesn't have a measurable effect upon
overall performance.
---
libavcodec/mlpdec.c | 40 +++++++++++++++++++++++-----------------
libavcodec/mlpdsp.c | 38 ++++++++++++++++++++++++++++++++++++++
libavcodec/mlpdsp.h | 22 ++++++++++++++++++++++
3 files changed, 83 insertions(+), 17 deletions(-)

diff --git a/libavcodec/mlpdec.c b/libavcodec/mlpdec.c
index b9d1704..49353d9 100644
--- a/libavcodec/mlpdec.c
+++ b/libavcodec/mlpdec.c
@@ -360,6 +360,10 @@ static int read_major_sync(MLPDecodeContext *m, GetBitContext *gb)
m->avctx->sample_fmt = AV_SAMPLE_FMT_S32;
else
m->avctx->sample_fmt = AV_SAMPLE_FMT_S16;
+ m->dsp.mlp_pack_output = m->dsp.mlp_select_pack_output(m->substream[m->max_decoded_substream].ch_assign,
+ m->substream[m->max_decoded_substream].output_shift,
+ m->substream[m->max_decoded_substream].max_matrix_channel,
+ m->avctx->sample_fmt == AV_SAMPLE_FMT_S32);

m->params_valid = 1;
for (substr = 0; substr < MAX_SUBSTREAMS; substr++)
@@ -588,6 +592,10 @@ FF_ENABLE_DEPRECATION_WARNINGS
if (substr == m->max_decoded_substream) {
m->avctx->channels = s->max_matrix_channel + 1;
m->avctx->channel_layout = s->ch_layout;
+ m->dsp.mlp_pack_output = m->dsp.mlp_select_pack_output(s->ch_assign,
+ s->output_shift,
+ s->max_matrix_channel,
+ m->avctx->sample_fmt == AV_SAMPLE_FMT_S32);
}

return 0;
@@ -818,9 +826,15 @@ static int read_decoding_params(MLPDecodeContext *m, GetBitContext *gbp,
return ret;

if (s->param_presence_flags & PARAM_OUTSHIFT)
- if (get_bits1(gbp))
+ if (get_bits1(gbp)) {
for (ch = 0; ch <= s->max_matrix_channel; ch++)
s->output_shift[ch] = get_sbits(gbp, 4);
+ if (substr == m->max_decoded_substream)
+ m->dsp.mlp_pack_output = m->dsp.mlp_select_pack_output(s->ch_assign,
+ s->output_shift,
+ s->max_matrix_channel,
+ m->avctx->sample_fmt == AV_SAMPLE_FMT_S32);
+ }

if (s->param_presence_flags & PARAM_QUANTSTEP)
if (get_bits1(gbp))
@@ -1019,9 +1033,6 @@ static int output_data(MLPDecodeContext *m, unsigned int substr,
{
AVCodecContext *avctx = m->avctx;
SubStream *s = &m->substream[substr];
- unsigned int i, out_ch = 0;
- int32_t *data_32;
- int16_t *data_16;
int ret;
int is32 = (m->avctx->sample_fmt == AV_SAMPLE_FMT_S32);

@@ -1041,19 +1052,14 @@ static int output_data(MLPDecodeContext *m, unsigned int substr,
av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
return ret;
}
- data_32 = (int32_t *)frame->data[0];
- data_16 = (int16_t *)frame->data[0];
-
- for (i = 0; i < s->blockpos; i++) {
- for (out_ch = 0; out_ch <= s->max_matrix_channel; out_ch++) {
- int mat_ch = s->ch_assign[out_ch];
- int32_t sample = m->sample_buffer[i][mat_ch]
- << s->output_shift[mat_ch];
- s->lossless_check_data ^= (sample & 0xffffff) << mat_ch;
- if (is32) *data_32++ = sample << 8;
- else *data_16++ = sample >> 8;
- }
- }
+ s->lossless_check_data = m->dsp.mlp_pack_output(s->lossless_check_data,
+ s->blockpos,
+ m->sample_buffer,
+ frame->data[0],
+ s->ch_assign,
+ s->output_shift,
+ s->max_matrix_channel,
+ is32);

/* Update matrix encoding side data */
if ((ret = ff_side_data_update_matrix_encoding(frame, s->matrix_encoding)) < 0)
diff --git a/libavcodec/mlpdsp.c b/libavcodec/mlpdsp.c
index dfa13af..aded554 100644
--- a/libavcodec/mlpdsp.c
+++ b/libavcodec/mlpdsp.c
@@ -89,10 +89,48 @@ void ff_mlp_rematrix_channel(int32_t *samples,
}
}

+static int32_t (*mlp_select_pack_output(uint8_t *ch_assign,
+ int8_t *output_shift,
+ uint8_t max_matrix_channel,
+ int is32))(int32_t, uint16_t, int32_t (*)[], void *, uint8_t*, int8_t *, uint8_t, int)
+{
+ return ff_mlp_pack_output;
+}
+
+int32_t ff_mlp_pack_output(int32_t lossless_check_data,
+ uint16_t blockpos,
+ int32_t (*sample_buffer)[MAX_CHANNELS],
+ void *data,
+ uint8_t *ch_assign,
+ int8_t *output_shift,
+ uint8_t max_matrix_channel,
+ int is32)
+{
+ unsigned int i, out_ch = 0;
+ int32_t *data_32 = data;
+ int16_t *data_16 = data;
+
+ for (i = 0; i < blockpos; i++) {
+ for (out_ch = 0; out_ch <= max_matrix_channel; out_ch++) {
+ int mat_ch = ch_assign[out_ch];
+ int32_t sample = sample_buffer[i][mat_ch]
+ << output_shift[mat_ch];
+ lossless_check_data ^= (sample & 0xffffff) << mat_ch;
+ if (is32)
+ *data_32++ = sample << 8;
+ else
+ *data_16++ = sample >> 8;
+ }
+ }
+ return lossless_check_data;
+}
+
av_cold void ff_mlpdsp_init(MLPDSPContext *c)
{
c->mlp_filter_channel = mlp_filter_channel;
c->mlp_rematrix_channel = ff_mlp_rematrix_channel;
+ c->mlp_select_pack_output = mlp_select_pack_output;
+ c->mlp_pack_output = ff_mlp_pack_output;
if (ARCH_ARM)
ff_mlpdsp_init_arm(c);
if (ARCH_X86)
diff --git a/libavcodec/mlpdsp.h b/libavcodec/mlpdsp.h
index bd864d9..acd48fc 100644
--- a/libavcodec/mlpdsp.h
+++ b/libavcodec/mlpdsp.h
@@ -23,6 +23,7 @@
#define AVCODEC_MLPDSP_H

#include <stdint.h>
+#include "mlp.h"

void ff_mlp_rematrix_channel(int32_t *samples,
const int32_t *coeffs,
@@ -36,6 +37,15 @@ void ff_mlp_rematrix_channel(int32_t *samples,
int access_unit_size_pow2,
int32_t mask);

+int32_t ff_mlp_pack_output(int32_t lossless_check_data,
+ uint16_t blockpos,
+ int32_t (*sample_buffer)[MAX_CHANNELS],
+ void *data,
+ uint8_t *ch_assign,
+ int8_t *output_shift,
+ uint8_t max_matrix_channel,
+ int is32);
+
typedef struct MLPDSPContext {
void (*mlp_filter_channel)(int32_t *state, const int32_t *coeff,
int firorder, int iirorder,
@@ -52,6 +62,18 @@ typedef struct MLPDSPContext {
int matrix_noise_shift,
int access_unit_size_pow2,
int32_t mask);
+ int32_t (*(*mlp_select_pack_output)(uint8_t *ch_assign,
+ int8_t *output_shift,
+ uint8_t max_matrix_channel,
+ int is32))(int32_t, uint16_t, int32_t (*)[], void *, uint8_t*, int8_t *, uint8_t, int);
+ int32_t (*mlp_pack_output)(int32_t lossless_check_data,
+ uint16_t blockpos,
+ int32_t (*sample_buffer)[MAX_CHANNELS],
+ void *data,
+ uint8_t *ch_assign,
+ int8_t *output_shift,
+ uint8_t max_matrix_channel,
+ int is32);
} MLPDSPContext;

void ff_mlpdsp_init(MLPDSPContext *c);
--
1.7.5.4
Ben Avison
2014-03-20 00:48:11 UTC
Permalink
Profiling results for overall decode and the output_data function in
particular are as follows:

Before After
Mean StdDev Mean StdDev Confidence Change
6:2 total 339.6 15.1 329.3 16.0 95.8% +3.1% (insignificant)
6:2 function 24.6 6.0 9.9 3.1 100.0% +148.5%
8:2 total 324.5 15.5 323.6 14.3 15.2% +0.3% (insignificant)
8:2 function 20.4 3.9 9.9 3.4 100.0% +104.7%
6:6 total 572.8 20.6 539.9 24.2 100.0% +6.1%
6:6 function 54.5 5.6 16.0 3.8 100.0% +240.9%
8:8 total 741.5 21.2 702.5 18.5 100.0% +5.6%
8:8 function 63.9 7.6 18.4 4.8 100.0% +247.3%

The assembly version has also been tested with a fuzz tester to ensure that
any combinations of inputs not exercised by my available test streams still
generate mathematically identical results to the C version.
---
libavcodec/arm/Makefile | 1 +
libavcodec/arm/mlpdsp_armv6.S | 526 ++++++++++++++++++++++++++++++++++++++
libavcodec/arm/mlpdsp_init_arm.c | 71 +++++
3 files changed, 598 insertions(+), 0 deletions(-)
create mode 100644 libavcodec/arm/mlpdsp_armv6.S

diff --git a/libavcodec/arm/Makefile b/libavcodec/arm/Makefile
index c6cc96e..8d7e25a 100644
--- a/libavcodec/arm/Makefile
+++ b/libavcodec/arm/Makefile
@@ -51,6 +51,7 @@ ARMV6-OBJS-$(CONFIG_AC3DSP) += arm/ac3dsp_armv6.o
ARMV6-OBJS-$(CONFIG_H264DSP) += arm/h264dsp_armv6.o
ARMV6-OBJS-$(CONFIG_HPELDSP) += arm/hpeldsp_init_armv6.o \
arm/hpeldsp_armv6.o
+ARMV6-OBJS-$(CONFIG_MLP_DECODER) += arm/mlpdsp_armv6.o
ARMV6-OBJS-$(CONFIG_MPEGAUDIODSP) += arm/mpegaudiodsp_fixed_armv6.o
ARMV6-OBJS-$(CONFIG_VP8_DECODER) += arm/vp8_armv6.o \
arm/vp8dsp_init_armv6.o \
diff --git a/libavcodec/arm/mlpdsp_armv6.S b/libavcodec/arm/mlpdsp_armv6.S
new file mode 100644
index 0000000..0959f79
--- /dev/null
+++ b/libavcodec/arm/mlpdsp_armv6.S
@@ -0,0 +1,526 @@
+/*
+ * Copyright (c) 2014 RISC OS Open Ltd
+ * Author: Ben Avison <***@riscosopen.org>
+ *
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/arm/asm.S"
+
+// This code uses too many ARM-only tricks to easily assemble as Thumb
+.arm
+
+.macro loadregoffsh2 group, index, base, offgroup, offindex
+ .altmacro
+ loadregoffsh2_ \group, %(\index), \base, \offgroup, %(\offindex)
+ .noaltmacro
+.endm
+
+.macro loadregoffsh2_ group, index, base, offgroup, offindex
+ ldr \group\index, [\base, \offgroup\offindex, lsl #2]
+.endm
+
+.macro eorlslreg check, data, group, index
+ .altmacro
+ eorlslreg_ \check, \data, \group, %(\index)
+ .noaltmacro
+.endm
+
+.macro eorlslreg_ check, data, group, index
+ eor \check, \check, \data, lsl \group\index
+.endm
+
+.macro decr_modulo var, by, modulus
+ .set \var, \var - \by
+ .if \var == 0
+ .set \var, \modulus
+ .endif
+.endm
+
+ .macro load_group1 size, channels, r0, r1, r2, r3, pointer_dead=0
+ .if \size == 2
+ ldrd \r0, \r1, [IN], #(\size + 8 - \channels) * 4
+ .else // size == 4
+ .if IDX1 > 4 || \channels==8
+ ldm IN!, {\r0, \r1, \r2, \r3}
+ .else
+ ldm IN, {\r0, \r1, \r2, \r3}
+ .if !\pointer_dead
+ add IN, IN, #(4 + 8 - \channels) * 4
+ .endif
+ .endif
+ .endif
+ decr_modulo IDX1, \size, \channels
+ .endm
+
+ .macro load_group2 size, channels, r0, r1, r2, r3, pointer_dead=0
+ .if \size == 2
+ .if IDX1 > 2
+ ldm IN!, {\r2, \r3}
+ .else
+//A .ifc \r2, ip
+//A .if \pointer_dead
+//A ldm IN, {\r2, \r3}
+//A .else
+//A ldr \r2, [IN], #4
+//A ldr \r3, [IN], #(\size - 1 + 8 - \channels) * 4
+//A .endif
+//A .else
+ ldrd \r2, \r3, [IN], #(\size + 8 - \channels) * 4
+//A .endif
+ .endif
+ .endif
+ decr_modulo IDX1, \size, \channels
+ .endm
+
+.macro implement_pack inorder, channels, shift
+.if \inorder
+.ifc \shift, mixed
+
+CHECK .req a1
+COUNT .req a2
+IN .req a3
+OUT .req a4
+DAT0 .req v1
+DAT1 .req v2
+DAT2 .req v3
+DAT3 .req v4
+SHIFT0 .req v5
+SHIFT1 .req v6
+SHIFT2 .req sl
+SHIFT3 .req fp
+SHIFT4 .req ip
+SHIFT5 .req lr
+
+ .macro output4words
+ .set SIZE_GROUP1, IDX1
+ .if SIZE_GROUP1 > 4
+ .set SIZE_GROUP1, 4
+ .endif
+ .set SIZE_GROUP2, 4 - SIZE_GROUP1
+ load_group1 SIZE_GROUP1, \channels, DAT0, DAT1, DAT2, DAT3
+ load_group2 SIZE_GROUP2, \channels, DAT0, DAT1, DAT2, DAT3
+ .if \channels == 2
+ lsl DAT0, SHIFT0
+ lsl DAT1, SHIFT1
+ lsl DAT2, SHIFT0
+ lsl DAT3, SHIFT1
+ .elseif \channels == 6
+ .if IDX2 == 6
+ lsl DAT0, SHIFT0
+ lsl DAT1, SHIFT1
+ lsl DAT2, SHIFT2
+ lsl DAT3, SHIFT3
+ .elseif IDX2 == 2
+ lsl DAT0, SHIFT4
+ lsl DAT1, SHIFT5
+ lsl DAT2, SHIFT0
+ lsl DAT3, SHIFT1
+ .else // IDX2 == 4
+ lsl DAT0, SHIFT2
+ lsl DAT1, SHIFT3
+ lsl DAT2, SHIFT4
+ lsl DAT3, SHIFT5
+ .endif
+ .elseif \channels == 8
+ .if IDX2 == 8
+ uxtb SHIFT0, SHIFT4, ror #0
+ uxtb SHIFT1, SHIFT4, ror #8
+ uxtb SHIFT2, SHIFT4, ror #16
+ uxtb SHIFT3, SHIFT4, ror #24
+ .else
+ uxtb SHIFT0, SHIFT5, ror #0
+ uxtb SHIFT1, SHIFT5, ror #8
+ uxtb SHIFT2, SHIFT5, ror #16
+ uxtb SHIFT3, SHIFT5, ror #24
+ .endif
+ lsl DAT0, SHIFT0
+ lsl DAT1, SHIFT1
+ lsl DAT2, SHIFT2
+ lsl DAT3, SHIFT3
+ .endif
+ eor CHECK, CHECK, DAT0, lsr #8 - (\channels - IDX2)
+ eor CHECK, CHECK, DAT1, lsr #7 - (\channels - IDX2)
+ decr_modulo IDX2, 2, \channels
+ eor CHECK, CHECK, DAT2, lsr #8 - (\channels - IDX2)
+ eor CHECK, CHECK, DAT3, lsr #7 - (\channels - IDX2)
+ decr_modulo IDX2, 2, \channels
+ stm OUT!, {DAT0 - DAT3}
+ .endm
+
+ .set WORDS_PER_LOOP, \channels // calculate LCM (channels, 4)
+ .if (WORDS_PER_LOOP % 2) == 0
+ .set WORDS_PER_LOOP, WORDS_PER_LOOP / 2
+ .endif
+ .if (WORDS_PER_LOOP % 2) == 0
+ .set WORDS_PER_LOOP, WORDS_PER_LOOP / 2
+ .endif
+ .set WORDS_PER_LOOP, WORDS_PER_LOOP * 4
+ .set SAMPLES_PER_LOOP, WORDS_PER_LOOP / \channels
+
+function ff_mlp_pack_output_inorder_\channels\()ch_mixedshift_armv6, export=1
+ .if SAMPLES_PER_LOOP > 1
+ tst COUNT, #SAMPLES_PER_LOOP - 1 // always seems to be in practice
+ bne X(ff_mlp_pack_output) // but just in case, branch to C implementation if not
+ .endif
+ teq COUNT, #0
+ bxeq lr
+ push {v1-v6,sl,fp,lr}
+ ldr SHIFT0, [sp, #(9+1)*4] // get output_shift from stack
+ ldr SHIFT1, =0x08080808
+ ldr SHIFT4, [SHIFT0]
+ .if \channels == 2
+ uadd8 SHIFT4, SHIFT4, SHIFT1 // increase all shifts by 8
+ uxtb SHIFT0, SHIFT4, ror #0
+ uxtb SHIFT1, SHIFT4, ror #8
+ .else
+ ldr SHIFT5, [SHIFT0, #4]
+ uadd8 SHIFT4, SHIFT4, SHIFT1 // increase all shifts by 8
+ uadd8 SHIFT5, SHIFT5, SHIFT1
+ .if \channels == 6
+ uxtb SHIFT0, SHIFT4, ror #0
+ uxtb SHIFT1, SHIFT4, ror #8
+ uxtb SHIFT2, SHIFT4, ror #16
+ uxtb SHIFT3, SHIFT4, ror #24
+ uxtb SHIFT4, SHIFT5, ror #0
+ uxtb SHIFT5, SHIFT5, ror #8
+ .endif
+ .endif
+ .set IDX1, \channels
+ .set IDX2, \channels
+0:
+ .rept WORDS_PER_LOOP / 4
+ output4words
+ .endr
+ subs COUNT, COUNT, #SAMPLES_PER_LOOP
+ bne 0b
+ pop {v1-v6,sl,fp,pc}
+ .ltorg
+endfunc
+ .purgem output4words
+
+ .unreq CHECK
+ .unreq COUNT
+ .unreq IN
+ .unreq OUT
+ .unreq DAT0
+ .unreq DAT1
+ .unreq DAT2
+ .unreq DAT3
+ .unreq SHIFT0
+ .unreq SHIFT1
+ .unreq SHIFT2
+ .unreq SHIFT3
+ .unreq SHIFT4
+ .unreq SHIFT5
+
+.else // not mixed
+
+CHECK .req a1
+COUNT .req a2
+IN .req a3
+OUT .req a4
+DAT0 .req v1
+DAT1 .req v2
+DAT2 .req v3
+DAT3 .req v4
+DAT4 .req v5
+DAT5 .req v6
+DAT6 .req sl // use these rather than the otherwise unused
+DAT7 .req fp // ip and lr so that we can load them usinf LDRD
+
+ .macro output4words tail, head, r0, r1, r2, r3, r4, r5, r6, r7, pointer_dead=0
+ .if \head
+ .set SIZE_GROUP1, IDX1
+ .if SIZE_GROUP1 > 4
+ .set SIZE_GROUP1, 4
+ .endif
+ .set SIZE_GROUP2, 4 - SIZE_GROUP1
+ load_group1 SIZE_GROUP1, \channels, \r0, \r1, \r2, \r3, \pointer_dead
+ .endif
+ .if \tail
+ eor CHECK, CHECK, \r4, lsr #8 - (\channels - IDX2)
+ eor CHECK, CHECK, \r5, lsr #7 - (\channels - IDX2)
+ decr_modulo IDX2, 2, \channels
+ .endif
+ .if \head
+ load_group2 SIZE_GROUP2, \channels, \r0, \r1, \r2, \r3, \pointer_dead
+ .endif
+ .if \tail
+ eor CHECK, CHECK, \r6, lsr #8 - (\channels - IDX2)
+ eor CHECK, CHECK, \r7, lsr #7 - (\channels - IDX2)
+ decr_modulo IDX2, 2, \channels
+ stm OUT!, {\r4, \r5, \r6, \r7}
+ .endif
+ .if \head
+ lsl \r0, #8 + \shift
+ lsl \r1, #8 + \shift
+ lsl \r2, #8 + \shift
+ lsl \r3, #8 + \shift
+ .endif
+ .endm
+
+ .set WORDS_PER_LOOP, \channels // calculate LCM (channels, 8)
+ .if (WORDS_PER_LOOP % 2) == 0
+ .set WORDS_PER_LOOP, WORDS_PER_LOOP / 2
+ .endif
+ .if (WORDS_PER_LOOP % 2) == 0
+ .set WORDS_PER_LOOP, WORDS_PER_LOOP / 2
+ .endif
+ .if (WORDS_PER_LOOP % 2) == 0
+ .set WORDS_PER_LOOP, WORDS_PER_LOOP / 2
+ .endif
+ .set WORDS_PER_LOOP, WORDS_PER_LOOP * 8
+ .set SAMPLES_PER_LOOP, WORDS_PER_LOOP / \channels
+
+function ff_mlp_pack_output_inorder_\channels\()ch_\shift\()shift_armv6, export=1
+ .if SAMPLES_PER_LOOP > 1
+ tst COUNT, #SAMPLES_PER_LOOP - 1 // always seems to be in practice
+ bne X(ff_mlp_pack_output) // but just in case, branch to C implementation if not
+ .endif
+ subs COUNT, COUNT, #SAMPLES_PER_LOOP
+ bxlo lr
+ push {v1-v6,sl,fp,lr}
+ .set IDX1, \channels
+ .set IDX2, \channels
+ output4words 0, 1, DAT0, DAT1, DAT2, DAT3, DAT4, DAT5, DAT6, DAT7
+0: beq 1f
+ .rept WORDS_PER_LOOP / 8
+ output4words 1, 1, DAT4, DAT5, DAT6, DAT7, DAT0, DAT1, DAT2, DAT3
+ output4words 1, 1, DAT0, DAT1, DAT2, DAT3, DAT4, DAT5, DAT6, DAT7
+ .endr
+ subs COUNT, COUNT, #SAMPLES_PER_LOOP
+ bne 0b
+1:
+ .rept WORDS_PER_LOOP / 8 - 1
+ output4words 1, 1, DAT4, DAT5, DAT6, DAT7, DAT0, DAT1, DAT2, DAT3
+ output4words 1, 1, DAT0, DAT1, DAT2, DAT3, DAT4, DAT5, DAT6, DAT7
+ .endr
+ output4words 1, 1, DAT4, DAT5, DAT6, DAT7, DAT0, DAT1, DAT2, DAT3, pointer_dead=1
+ output4words 1, 0, DAT0, DAT1, DAT2, DAT3, DAT4, DAT5, DAT6, DAT7
+ pop {v1-v6,sl,fp,pc}
+endfunc
+ .purgem output4words
+
+ .unreq CHECK
+ .unreq COUNT
+ .unreq IN
+ .unreq OUT
+ .unreq DAT0
+ .unreq DAT1
+ .unreq DAT2
+ .unreq DAT3
+ .unreq DAT4
+ .unreq DAT5
+ .unreq DAT6
+ .unreq DAT7
+
+.endif // mixed
+.else // not inorder
+.ifc \shift, mixed
+
+// This case not currently handled
+
+.else // not mixed
+
+CHECK .req a1
+COUNT .req a2
+IN .req a3
+OUT .req a4
+DAT0 .req v1
+DAT1 .req v2
+DAT2 .req v3
+DAT3 .req v4
+CHAN0 .req v5
+CHAN1 .req v6
+CHAN2 .req sl
+CHAN3 .req fp
+CHAN4 .req ip
+CHAN5 .req lr
+
+ .macro output4words
+ .if \channels == 8
+ .if IDX1 == 8
+ uxtb CHAN0, CHAN4, ror #0
+ uxtb CHAN1, CHAN4, ror #8
+ uxtb CHAN2, CHAN4, ror #16
+ uxtb CHAN3, CHAN4, ror #24
+ .else
+ uxtb CHAN0, CHAN5, ror #0
+ uxtb CHAN1, CHAN5, ror #8
+ uxtb CHAN2, CHAN5, ror #16
+ uxtb CHAN3, CHAN5, ror #24
+ .endif
+ ldr DAT0, [IN, CHAN0, lsl #2]
+ ldr DAT1, [IN, CHAN1, lsl #2]
+ ldr DAT2, [IN, CHAN2, lsl #2]
+ ldr DAT3, [IN, CHAN3, lsl #2]
+ .if IDX1 == 4
+ add IN, IN, #8*4
+ .endif
+ decr_modulo IDX1, 4, \channels
+ .else
+ .set SIZE_GROUP1, IDX1
+ .if SIZE_GROUP1 > 4
+ .set SIZE_GROUP1, 4
+ .endif
+ .set SIZE_GROUP2, 4 - SIZE_GROUP1
+ .if SIZE_GROUP1 == 2
+ loadregoffsh2 DAT, 0, IN, CHAN, 0 + (\channels - IDX1)
+ loadregoffsh2 DAT, 1, IN, CHAN, 1 + (\channels - IDX1)
+ add IN, IN, #8*4
+ .else // SIZE_GROUP1 == 4
+ loadregoffsh2 DAT, 0, IN, CHAN, 0 + (\channels - IDX1)
+ loadregoffsh2 DAT, 1, IN, CHAN, 1 + (\channels - IDX1)
+ loadregoffsh2 DAT, 2, IN, CHAN, 2 + (\channels - IDX1)
+ loadregoffsh2 DAT, 3, IN, CHAN, 3 + (\channels - IDX1)
+ .if IDX1 == 4
+ add IN, IN, #8*4
+ .endif
+ .endif
+ decr_modulo IDX1, SIZE_GROUP1, \channels
+ .if SIZE_GROUP2 == 2
+ loadregoffsh2 DAT, 2, IN, CHAN, 0 + (\channels - IDX1)
+ loadregoffsh2 DAT, 3, IN, CHAN, 1 + (\channels - IDX1)
+ .if IDX1 == 2
+ add IN, IN, #8*4
+ .endif
+ .endif
+ decr_modulo IDX1, SIZE_GROUP2, \channels
+ .endif
+ .if \channels == 8 // in this case we can corrupt CHAN0-3
+ rsb CHAN0, CHAN0, #8
+ rsb CHAN1, CHAN1, #8
+ rsb CHAN2, CHAN2, #8
+ rsb CHAN3, CHAN3, #8
+ lsl DAT0, #8 + \shift
+ lsl DAT1, #8 + \shift
+ lsl DAT2, #8 + \shift
+ lsl DAT3, #8 + \shift
+ eor CHECK, CHECK, DAT0, lsr CHAN0
+ eor CHECK, CHECK, DAT1, lsr CHAN1
+ eor CHECK, CHECK, DAT2, lsr CHAN2
+ eor CHECK, CHECK, DAT3, lsr CHAN3
+ .else
+ .if \shift != 0
+ lsl DAT0, #\shift
+ lsl DAT1, #\shift
+ lsl DAT2, #\shift
+ lsl DAT3, #\shift
+ .endif
+ bic DAT0, DAT0, #0xff000000
+ bic DAT1, DAT1, #0xff000000
+ bic DAT2, DAT2, #0xff000000
+ bic DAT3, DAT3, #0xff000000
+ eorlslreg CHECK, DAT0, CHAN, 0 + (\channels - IDX2)
+ eorlslreg CHECK, DAT1, CHAN, 1 + (\channels - IDX2)
+ decr_modulo IDX2, 2, \channels
+ eorlslreg CHECK, DAT2, CHAN, 0 + (\channels - IDX2)
+ eorlslreg CHECK, DAT3, CHAN, 1 + (\channels - IDX2)
+ decr_modulo IDX2, 2, \channels
+ lsl DAT0, #8
+ lsl DAT1, #8
+ lsl DAT2, #8
+ lsl DAT3, #8
+ .endif
+ stm OUT!, {DAT0 - DAT3}
+ .endm
+
+ .set WORDS_PER_LOOP, \channels // calculate LCM (channels, 4)
+ .if (WORDS_PER_LOOP % 2) == 0
+ .set WORDS_PER_LOOP, WORDS_PER_LOOP / 2
+ .endif
+ .if (WORDS_PER_LOOP % 2) == 0
+ .set WORDS_PER_LOOP, WORDS_PER_LOOP / 2
+ .endif
+ .set WORDS_PER_LOOP, WORDS_PER_LOOP * 4
+ .set SAMPLES_PER_LOOP, WORDS_PER_LOOP / \channels
+
+function ff_mlp_pack_output_outoforder_\channels\()ch_\shift\()shift_armv6, export=1
+ .if SAMPLES_PER_LOOP > 1
+ tst COUNT, #SAMPLES_PER_LOOP - 1 // always seems to be in practice
+ bne X(ff_mlp_pack_output) // but just in case, branch to C implementation if not
+ .endif
+ teq COUNT, #0
+ bxeq lr
+ push {v1-v6,sl,fp,lr}
+ ldr CHAN0, [sp, #(9+0)*4] // get ch_assign from stack
+ ldr CHAN4, [CHAN0]
+ .if \channels == 2
+ uxtb CHAN0, CHAN4, ror #0
+ uxtb CHAN1, CHAN4, ror #8
+ .else
+ ldr CHAN5, [CHAN0, #4]
+ .if \channels == 6
+ uxtb CHAN0, CHAN4, ror #0
+ uxtb CHAN1, CHAN4, ror #8
+ uxtb CHAN2, CHAN4, ror #16
+ uxtb CHAN3, CHAN4, ror #24
+ uxtb CHAN4, CHAN5, ror #0
+ uxtb CHAN5, CHAN5, ror #8
+ .endif
+ .endif
+ .set IDX1, \channels
+ .set IDX2, \channels
+0:
+ .rept WORDS_PER_LOOP / 4
+ output4words
+ .endr
+ subs COUNT, COUNT, #SAMPLES_PER_LOOP
+ bne 0b
+ pop {v1-v6,sl,fp,pc}
+ .ltorg
+endfunc
+ .purgem output4words
+
+ .unreq CHECK
+ .unreq COUNT
+ .unreq IN
+ .unreq OUT
+ .unreq DAT0
+ .unreq DAT1
+ .unreq DAT2
+ .unreq DAT3
+ .unreq CHAN0
+ .unreq CHAN1
+ .unreq CHAN2
+ .unreq CHAN3
+ .unreq CHAN4
+ .unreq CHAN5
+
+.endif // mixed
+.endif // inorder
+.endm // implement_pack
+
+.macro pack_channels inorder, channels
+ implement_pack \inorder, \channels, 0
+ implement_pack \inorder, \channels, 1
+ implement_pack \inorder, \channels, 2
+ implement_pack \inorder, \channels, 3
+ implement_pack \inorder, \channels, 4
+ implement_pack \inorder, \channels, 5
+ implement_pack \inorder, \channels, mixed
+.endm
+
+.macro pack_order inorder
+ pack_channels \inorder, 2
+ pack_channels \inorder, 6
+ pack_channels \inorder, 8
+.endm
+
+ pack_order 0
+ pack_order 1
diff --git a/libavcodec/arm/mlpdsp_init_arm.c b/libavcodec/arm/mlpdsp_init_arm.c
index 268dfdd..e555b44 100644
--- a/libavcodec/arm/mlpdsp_init_arm.c
+++ b/libavcodec/arm/mlpdsp_init_arm.c
@@ -41,8 +41,79 @@ void ff_mlp_rematrix_channel_arm(int32_t *samples,
int access_unit_size_pow2,
int32_t mask);

+#define DECLARE_PACK(order,channels,shift) \
+ int32_t ff_mlp_pack_output_##order##order_##channels##ch_##shift##shift_armv6(int32_t, uint16_t, int32_t (*)[], void *, uint8_t*, int8_t *, uint8_t, int);
+#define ENUMERATE_PACK(order,channels,shift) \
+ ff_mlp_pack_output_##order##order_##channels##ch_##shift##shift_armv6,
+#define PACK_CHANNELS(macro,order,channels) \
+ macro(order,channels,0) \
+ macro(order,channels,1) \
+ macro(order,channels,2) \
+ macro(order,channels,3) \
+ macro(order,channels,4) \
+ macro(order,channels,5) \
+ macro(order,channels,mixed)
+#define PACK_ORDER(macro,order) \
+ PACK_CHANNELS(macro,order,2) \
+ PACK_CHANNELS(macro,order,6) \
+ PACK_CHANNELS(macro,order,8)
+#define PACK_ALL(macro) \
+ PACK_ORDER(macro,outof) \
+ PACK_ORDER(macro,in)
+PACK_ALL(DECLARE_PACK)
+
+#define ff_mlp_pack_output_outoforder_2ch_mixedshift_armv6 0
+#define ff_mlp_pack_output_outoforder_6ch_mixedshift_armv6 0
+#define ff_mlp_pack_output_outoforder_8ch_mixedshift_armv6 0
+
+static int32_t (*mlp_select_pack_output_armv6(uint8_t *ch_assign,
+ int8_t *output_shift,
+ uint8_t max_matrix_channel,
+ int is32))(int32_t, uint16_t, int32_t (*)[], void *, uint8_t*, int8_t *, uint8_t, int)
+{
+ int ch_index;
+ int shift = output_shift[0] < 0 || output_shift[0] > 5 ? 6 : output_shift[0];
+ int inorder = 1;
+ static int32_t (*const routine[2*3*7])(int32_t, uint16_t, int32_t (*)[], void *, uint8_t*, int8_t *, uint8_t, int) = {
+ PACK_ALL(ENUMERATE_PACK)
+ };
+ int i;
+
+ if (!is32) // don't support 16-bit output (it's not used by TrueHD)
+ return ff_mlp_pack_output;
+
+ switch (max_matrix_channel) {
+ case 1:
+ ch_index = 0;
+ break;
+ case 5:
+ ch_index = 1;
+ break;
+ case 7:
+ ch_index = 2;
+ break;
+ default:
+ return ff_mlp_pack_output;
+ }
+
+ for (i = 0; i <= max_matrix_channel; i++) {
+ if (shift != 6 && output_shift[i] != shift)
+ shift = 6; // indicate mixed shifts
+ if (ch_assign[i] != i)
+ inorder = 0;
+ }
+ if (shift == 6 && !inorder)
+ return ff_mlp_pack_output; // can't currently handle both an order array and a shift array
+
+ return routine[(inorder*3+ch_index)*7+shift];
+}
+
av_cold void ff_mlpdsp_init_arm(MLPDSPContext *c)
{
+ int cpu_flags = av_get_cpu_flags();
+
c->mlp_filter_channel = ff_mlp_filter_channel_arm;
c->mlp_rematrix_channel = ff_mlp_rematrix_channel_arm;
+ if (cpu_flags & AV_CPU_FLAG_ARMV6)
+ c->mlp_select_pack_output = mlp_select_pack_output_armv6;
}
--
1.7.5.4
Ben Avison
2014-03-20 18:58:35 UTC
Permalink
Profiling results for overall audio decode and the mlp_filter_channel(_arm)
function in particular are as follows:

Before After
Mean StdDev Mean StdDev Confidence Change
6:2 total 380.4 22.0 370.8 17.0 87.4% +2.6% (insignificant)
6:2 function 60.7 7.2 36.6 8.1 100.0% +65.8%
8:2 total 357.0 17.5 343.2 19.0 97.8% +4.0% (insignificant)
8:2 function 60.3 8.8 37.3 3.8 100.0% +61.8%
6:6 total 717.2 23.2 658.4 15.7 100.0% +8.9%
6:6 function 140.4 12.9 81.5 9.2 100.0% +72.4%
8:8 total 981.9 16.2 896.2 24.5 100.0% +9.6%
8:8 function 193.4 15.0 103.3 11.5 100.0% +87.2%

Experiments with adding preload instructions to this function yielded no
useful benefit, so these have not been included.

The assembly version has also been tested with a fuzz tester to ensure that
any combinations of inputs not exercised by my available test streams still
generate mathematically identical results to the C version.
---
libavcodec/arm/Makefile | 2 +
libavcodec/arm/mlpdsp_arm.S | 433 ++++++++++++++++++++++++++++++++++++++
libavcodec/arm/mlpdsp_init_arm.c | 36 +++
libavcodec/mlpdsp.c | 2 +
libavcodec/mlpdsp.h | 1 +
5 files changed, 474 insertions(+), 0 deletions(-)
create mode 100644 libavcodec/arm/mlpdsp_arm.S
create mode 100644 libavcodec/arm/mlpdsp_init_arm.c

diff --git a/libavcodec/arm/Makefile b/libavcodec/arm/Makefile
index 8bdccbd..c6cc96e 100644
--- a/libavcodec/arm/Makefile
+++ b/libavcodec/arm/Makefile
@@ -21,6 +21,8 @@ OBJS-$(CONFIG_H264PRED) += arm/h264pred_init_arm.o
OBJS-$(CONFIG_H264QPEL) += arm/h264qpel_init_arm.o
OBJS-$(CONFIG_HPELDSP) += arm/hpeldsp_init_arm.o \
arm/hpeldsp_arm.o
+OBJS-$(CONFIG_MLP_DECODER) += arm/mlpdsp_init_arm.o \
+ arm/mlpdsp_arm.o
OBJS-$(CONFIG_MPEGAUDIODSP) += arm/mpegaudiodsp_init_arm.o
OBJS-$(CONFIG_MPEGVIDEO) += arm/mpegvideo_arm.o
OBJS-$(CONFIG_NEON_CLOBBER_TEST) += arm/neontest.o
diff --git a/libavcodec/arm/mlpdsp_arm.S b/libavcodec/arm/mlpdsp_arm.S
new file mode 100644
index 0000000..f36ba54
--- /dev/null
+++ b/libavcodec/arm/mlpdsp_arm.S
@@ -0,0 +1,433 @@
+/*
+ * Copyright (c) 2014 RISC OS Open Ltd
+ * Author: Ben Avison <***@riscosopen.org>
+ *
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/arm/asm.S"
+
+#define MAX_CHANNELS 8
+#define MAX_FIR_ORDER 8
+#define MAX_IIR_ORDER 4
+#define MAX_RATEFACTOR 4
+#define MAX_BLOCKSIZE (40 * MAX_RATEFACTOR)
+
+PST .req a1
+PCO .req a2
+AC0 .req a3
+AC1 .req a4
+CO0 .req v1
+CO1 .req v2
+CO2 .req v3
+CO3 .req v4
+ST0 .req v5
+ST1 .req v6
+ST2 .req sl
+ST3 .req fp
+I .req ip
+PSAMP .req lr
+
+
+// Some macros that do loads/multiplies where the register number is determined
+// from an assembly-time expression. Boy is GNU assembler's syntax ugly...
+
+.macro load group, index, base, offset
+ .altmacro
+ load_ \group, %(\index), \base, \offset
+ .noaltmacro
+.endm
+
+.macro load_ group, index, base, offset
+ ldr \group\index, [\base, #\offset]
+.endm
+
+.macro loadd group, index, base, offset
+ .altmacro
+ loadd_ \group, %(\index), %(\index+1), \base, \offset
+ .noaltmacro
+.endm
+
+.macro loadd_ group, index0, index1, base, offset
+A .if offset >= 256
+A ldr \group\index0, [\base, #\offset]
+A ldr \group\index1, [\base, #(\offset) + 4]
+A .else
+ ldrd \group\index0, \group\index1, [\base, #\offset]
+A .endif
+.endm
+
+.macro multiply index, accumulate, long
+ .altmacro
+ multiply_ %(\index), \accumulate, \long
+ .noaltmacro
+.endm
+
+.macro multiply_ index, accumulate, long
+ .if \long
+ .if \accumulate
+ smlal AC0, AC1, CO\index, ST\index
+ .else
+ smull AC0, AC1, CO\index, ST\index
+ .endif
+ .else
+ .if \accumulate
+ mla AC0, CO\index, ST\index, AC0
+ .else
+ mul AC0, CO\index, ST\index
+ .endif
+ .endif
+.endm
+
+// A macro to update the load register number and load offsets
+
+.macro inc howmany
+ .set LOAD_REG, (LOAD_REG + \howmany) & 3
+ .set OFFSET_CO, OFFSET_CO + 4 * \howmany
+ .set OFFSET_ST, OFFSET_ST + 4 * \howmany
+ .if FIR_REMAIN > 0
+ .set FIR_REMAIN, FIR_REMAIN - \howmany
+ .if FIR_REMAIN == 0
+ .set OFFSET_CO, 4 * MAX_FIR_ORDER
+ .set OFFSET_ST, 4 * (MAX_BLOCKSIZE + MAX_FIR_ORDER)
+ .endif
+ .elseif IIR_REMAIN > 0
+ .set IIR_REMAIN, IIR_REMAIN - \howmany
+ .endif
+.endm
+
+// Macro to implement the inner loop for one specific combination of parameters
+
+.macro implement_filter mask_minus1, shift_0, shift_8, iir_taps, fir_taps
+ .set TOTAL_TAPS, \iir_taps + \fir_taps
+
+ // Deal with register allocation...
+ .set DEFINED_SHIFT, 0
+ .set DEFINED_MASK, 0
+ .set SHUFFLE_SHIFT, 0
+ .set SHUFFLE_MASK, 0
+ .set SPILL_SHIFT, 0
+ .set SPILL_MASK, 0
+ .if TOTAL_TAPS == 0
+ // Little register pressure in this case - just keep MASK where it was
+ .if !\mask_minus1
+ MASK .req ST1
+ .set DEFINED_MASK, 1
+ .endif
+ .else
+ .if \shift_0
+ .if !\mask_minus1
+ // AC1 is unused with shift 0
+ MASK .req AC1
+ .set DEFINED_MASK, 1
+ .set SHUFFLE_MASK, 1
+ .endif
+ .elseif \shift_8
+ .if !\mask_minus1
+ .if TOTAL_TAPS <= 4
+ // All coefficients are preloaded (so pointer not needed)
+ MASK .req PCO
+ .set DEFINED_MASK, 1
+ .set SHUFFLE_MASK, 1
+ .else
+ .set SPILL_MASK, 1
+ .endif
+ .endif
+ .else // shift not 0 or 8
+ .if TOTAL_TAPS <= 3
+ // All coefficients are preloaded, and at least one CO register is unused
+ .if \fir_taps & 1
+ SHIFT .req CO0
+ .set DEFINED_SHIFT, 1
+ .set SHUFFLE_SHIFT, 1
+ .else
+ SHIFT .req CO3
+ .set DEFINED_SHIFT, 1
+ .set SHUFFLE_SHIFT, 1
+ .endif
+ .if !\mask_minus1
+ MASK .req PCO
+ .set DEFINED_MASK, 1
+ .set SHUFFLE_MASK, 1
+ .endif
+ .elseif TOTAL_TAPS == 4
+ // All coefficients are preloaded
+ SHIFT .req PCO
+ .set DEFINED_SHIFT, 1
+ .set SHUFFLE_SHIFT, 1
+ .if !\mask_minus1
+ .set SPILL_MASK, 1
+ .endif
+ .else
+ .set SPILL_SHIFT, 1
+ .if !\mask_minus1
+ .set SPILL_MASK, 1
+ .endif
+ .endif
+ .endif
+ .endif
+ .if SPILL_SHIFT
+ SHIFT .req ST0
+ .set DEFINED_SHIFT, 1
+ .endif
+ .if SPILL_MASK
+ MASK .req ST1
+ .set DEFINED_MASK, 1
+ .endif
+
+ // Preload coefficients if possible
+ .if TOTAL_TAPS <= 4
+ .set OFFSET_CO, 0
+ .if \fir_taps & 1
+ .set LOAD_REG, 1
+ .else
+ .set LOAD_REG, 0
+ .endif
+ .rept \fir_taps
+ load CO, LOAD_REG, PCO, OFFSET_CO
+ .set LOAD_REG, (LOAD_REG + 1) & 3
+ .set OFFSET_CO, OFFSET_CO + 4
+ .endr
+ .set OFFSET_CO, 4 * MAX_FIR_ORDER
+ .rept \iir_taps
+ load CO, LOAD_REG, PCO, OFFSET_CO
+ .set LOAD_REG, (LOAD_REG + 1) & 3
+ .set OFFSET_CO, OFFSET_CO + 4
+ .endr
+ .endif
+
+ // Move mask/shift to final positions if necessary
+ // Need to do this after preloading, because in some cases we
+ // reuse the coefficient pointer register
+ .if SHUFFLE_SHIFT
+ mov SHIFT, ST0
+ .endif
+ .if SHUFFLE_MASK
+ mov MASK, ST1
+ .endif
+
+ // Begin loop
+01:
+ .if TOTAL_TAPS == 0
+ // Things simplify a lot in this case
+ // In fact this could be pipelined further if it's worth it...
+ ldr ST0, [PSAMP]
+ subs I, I, #1
+ .if !\mask_minus1
+ and ST0, ST0, MASK
+ .endif
+ str ST0, [PST, #-4]!
+ str ST0, [PST, #4 * (MAX_BLOCKSIZE + MAX_FIR_ORDER)]
+ str ST0, [PSAMP], #4 * MAX_CHANNELS
+ bne 01b
+ .else
+ .if \fir_taps & 1
+ .set LOAD_REG, 1
+ .else
+ .set LOAD_REG, 0
+ .endif
+ .set LOAD_BANK, 0
+ .set FIR_REMAIN, \fir_taps
+ .set IIR_REMAIN, \iir_taps
+ .if FIR_REMAIN == 0 // only IIR terms
+ .set OFFSET_CO, 4 * MAX_FIR_ORDER
+ .set OFFSET_ST, 4 * (MAX_BLOCKSIZE + MAX_FIR_ORDER)
+ .else
+ .set OFFSET_CO, 0
+ .set OFFSET_ST, 0
+ .endif
+ .set MUL_REG, LOAD_REG
+ .set COUNTER, 0
+ .rept TOTAL_TAPS + 2
+ // Do load(s)
+ .if FIR_REMAIN != 0 || IIR_REMAIN != 0
+ .if COUNTER == 0
+ .if TOTAL_TAPS > 4
+ load CO, LOAD_REG, PCO, OFFSET_CO
+ .endif
+ load ST, LOAD_REG, PST, OFFSET_ST
+ inc 1
+ .elseif COUNTER == 1 && (\fir_taps & 1) == 0
+ .if TOTAL_TAPS > 4
+ load CO, LOAD_REG, PCO, OFFSET_CO
+ .endif
+ load ST, LOAD_REG, PST, OFFSET_ST
+ inc 1
+ .elseif LOAD_BANK == 0
+ .if TOTAL_TAPS > 4
+ .if FIR_REMAIN == 0 && IIR_REMAIN == 1
+ load CO, LOAD_REG, PCO, OFFSET_CO
+ .else
+ loadd CO, LOAD_REG, PCO, OFFSET_CO
+ .endif
+ .endif
+ .set LOAD_BANK, 1
+ .else
+ .if FIR_REMAIN == 0 && IIR_REMAIN == 1
+ load ST, LOAD_REG, PST, OFFSET_ST
+ inc 1
+ .else
+ loadd ST, LOAD_REG, PST, OFFSET_ST
+ inc 2
+ .endif
+ .set LOAD_BANK, 0
+ .endif
+ .endif
+
+ // Do interleaved multiplies, slightly delayed
+ .if COUNTER >= 2
+ multiply MUL_REG, COUNTER > 2, !\shift_0
+ .set MUL_REG, (MUL_REG + 1) & 3
+ .endif
+ .set COUNTER, COUNTER + 1
+ .endr
+
+ // Post-process the result of the multiplies
+ .if SPILL_SHIFT
+ ldr SHIFT, [sp, #9*4 + 0*4]
+ .endif
+ .if SPILL_MASK
+ ldr MASK, [sp, #9*4 + 1*4]
+ .endif
+ ldr ST2, [PSAMP]
+ subs I, I, #1
+ .if \shift_8
+ mov AC0, AC0, lsr #8
+ orr AC0, AC0, AC1, lsl #24
+ .elseif !\shift_0
+ rsb ST3, SHIFT, #32
+ mov AC0, AC0, lsr SHIFT
+A orr AC0, AC0, AC1, lsl ST3
+T mov AC1, AC1, lsl ST3
+T orr AC0, AC0, AC1
+ .endif
+ .if \mask_minus1
+ add ST3, ST2, AC0
+ .else
+ add ST2, ST2, AC0
+ and ST3, ST2, MASK
+ sub ST2, ST3, AC0
+ .endif
+ str ST3, [PST, #-4]!
+ str ST2, [PST, #4 * (MAX_BLOCKSIZE + MAX_FIR_ORDER)]
+ str ST3, [PSAMP], #4 * MAX_CHANNELS
+ bne 01b
+ .endif
+ b 99f
+
+ .if DEFINED_SHIFT
+ .unreq SHIFT
+ .endif
+ .if DEFINED_MASK
+ .unreq MASK
+ .endif
+.endm
+
+.macro switch_on_fir_taps mask_minus1, shift_0, shift_8, iir_taps
+A ldr pc, [pc, a3, LSL #2] // firorder is in range 0-(8-iir_taps)
+T tbh [pc, a3, lsl #1]
+0:
+A .word 0, 70f, 71f, 72f, 73f, 74f
+T .hword (70f - 0b) / 2, (71f - 0b) / 2, (72f - 0b) / 2, (73f - 0b) / 2, (74f - 0b) / 2
+ .if \iir_taps <= 3
+A .word 75f
+T .hword (75f - 0b) / 2
+ .if \iir_taps <= 2
+A .word 76f
+T .hword (76f - 0b) / 2
+ .if \iir_taps <= 1
+A .word 77f
+T .hword (77f - 0b) / 2
+ .if \iir_taps == 0
+A .word 78f
+T .hword (78f - 0b) / 2
+ .endif
+ .endif
+ .endif
+ .endif
+70: implement_filter \mask_minus1, \shift_0, \shift_8, \iir_taps, 0
+71: implement_filter \mask_minus1, \shift_0, \shift_8, \iir_taps, 1
+72: implement_filter \mask_minus1, \shift_0, \shift_8, \iir_taps, 2
+73: implement_filter \mask_minus1, \shift_0, \shift_8, \iir_taps, 3
+74: implement_filter \mask_minus1, \shift_0, \shift_8, \iir_taps, 4
+ .if \iir_taps <= 3
+75: implement_filter \mask_minus1, \shift_0, \shift_8, \iir_taps, 5
+ .if \iir_taps <= 2
+76: implement_filter \mask_minus1, \shift_0, \shift_8, \iir_taps, 6
+ .if \iir_taps <= 1
+77: implement_filter \mask_minus1, \shift_0, \shift_8, \iir_taps, 7
+ .if \iir_taps == 0
+78: implement_filter \mask_minus1, \shift_0, \shift_8, \iir_taps, 8
+ .endif
+ .endif
+ .endif
+ .endif
+.endm
+
+.macro switch_on_iir_taps mask_minus1, shift_0, shift_8
+A ldr pc, [pc, a4, LSL #2] // irorder is in range 0-4
+T tbh [pc, a4, lsl #1]
+0:
+A .word 0, 60f, 61f, 62f, 63f, 64f
+T .hword (60f - 0b) / 2, (61f - 0b) / 2, (62f - 0b) / 2, (63f - 0b) / 2, (64f - 0b) / 2
+60: switch_on_fir_taps \mask_minus1, \shift_0, \shift_8, 0
+61: switch_on_fir_taps \mask_minus1, \shift_0, \shift_8, 1
+62: switch_on_fir_taps \mask_minus1, \shift_0, \shift_8, 2
+63: switch_on_fir_taps \mask_minus1, \shift_0, \shift_8, 3
+64: switch_on_fir_taps \mask_minus1, \shift_0, \shift_8, 4
+.endm
+
+/* void ff_mlp_filter_channel_arm(int32_t *state, const int32_t *coeff,
+ * int firorder, int iirorder,
+ * unsigned int filter_shift, int32_t mask,
+ * int blocksize, int32_t *sample_buffer);
+ */
+function ff_mlp_filter_channel_arm, export=1
+ push {v1-fp,lr}
+ add v1, sp, #9*4 // point at arguments on stack
+ ldm v1, {ST0,ST1,I,PSAMP}
+ cmp ST1, #-1
+ bne 30f
+ movs ST2, ST0, lsl #29 // shift is in range 0-15; we want to special-case 0 and 8
+ bne 20f
+ bcs 10f
+ switch_on_iir_taps 1, 1, 0
+10: switch_on_iir_taps 1, 0, 1
+20: switch_on_iir_taps 1, 0, 0
+30: movs ST2, ST0, lsl #29 // shift is in range 0-15; we want to special-case 0 and 8
+ bne 50f
+ bcs 40f
+ switch_on_iir_taps 0, 1, 0
+40: switch_on_iir_taps 0, 0, 1
+50: switch_on_iir_taps 0, 0, 0
+99: pop {v1-fp,pc}
+endfunc
+
+ .unreq PST
+ .unreq PCO
+ .unreq AC0
+ .unreq AC1
+ .unreq CO0
+ .unreq CO1
+ .unreq CO2
+ .unreq CO3
+ .unreq ST0
+ .unreq ST1
+ .unreq ST2
+ .unreq ST3
+ .unreq I
+ .unreq PSAMP
diff --git a/libavcodec/arm/mlpdsp_init_arm.c b/libavcodec/arm/mlpdsp_init_arm.c
new file mode 100644
index 0000000..f0ea285
--- /dev/null
+++ b/libavcodec/arm/mlpdsp_init_arm.c
@@ -0,0 +1,36 @@
+/*
+ * Copyright (c) 2014 RISC OS Open Ltd
+ * Author: Ben Avison <***@riscosopen.org>
+ *
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdint.h>
+
+#include "libavutil/arm/cpu.h"
+#include "libavutil/attributes.h"
+#include "libavcodec/mlpdsp.h"
+
+void ff_mlp_filter_channel_arm(int32_t *state, const int32_t *coeff,
+ int firorder, int iirorder,
+ unsigned int filter_shift, int32_t mask,
+ int blocksize, int32_t *sample_buffer);
+
+av_cold void ff_mlpdsp_init_arm(MLPDSPContext *c)
+{
+ c->mlp_filter_channel = ff_mlp_filter_channel_arm;
+}
diff --git a/libavcodec/mlpdsp.c b/libavcodec/mlpdsp.c
index b96200e..151cf83 100644
--- a/libavcodec/mlpdsp.c
+++ b/libavcodec/mlpdsp.c
@@ -60,6 +60,8 @@ static void mlp_filter_channel(int32_t *state, const int32_t *coeff,
av_cold void ff_mlpdsp_init(MLPDSPContext *c)
{
c->mlp_filter_channel = mlp_filter_channel;
+ if (ARCH_ARM)
+ ff_mlpdsp_init_arm(c);
if (ARCH_X86)
ff_mlpdsp_init_x86(c);
}
diff --git a/libavcodec/mlpdsp.h b/libavcodec/mlpdsp.h
index 995f72a..c985a17 100644
--- a/libavcodec/mlpdsp.h
+++ b/libavcodec/mlpdsp.h
@@ -32,6 +32,7 @@ typedef struct MLPDSPContext {
} MLPDSPContext;

void ff_mlpdsp_init(MLPDSPContext *c);
+void ff_mlpdsp_init_arm(MLPDSPContext *c);
void ff_mlpdsp_init_x86(MLPDSPContext *c);

#endif /* AVCODEC_MLPDSP_H */
--
1.7.5.4
Martin Storsjö
2014-03-24 12:10:52 UTC
Permalink
Post by Ben Avison
Profiling results for overall audio decode and the mlp_filter_channel(_arm)
Before After
Mean StdDev Mean StdDev Confidence Change
6:2 total 380.4 22.0 370.8 17.0 87.4% +2.6% (insignificant)
6:2 function 60.7 7.2 36.6 8.1 100.0% +65.8%
8:2 total 357.0 17.5 343.2 19.0 97.8% +4.0% (insignificant)
8:2 function 60.3 8.8 37.3 3.8 100.0% +61.8%
6:6 total 717.2 23.2 658.4 15.7 100.0% +8.9%
6:6 function 140.4 12.9 81.5 9.2 100.0% +72.4%
8:8 total 981.9 16.2 896.2 24.5 100.0% +9.6%
8:8 function 193.4 15.0 103.3 11.5 100.0% +87.2%
Experiments with adding preload instructions to this function yielded no
useful benefit, so these have not been included.
The assembly version has also been tested with a fuzz tester to ensure that
any combinations of inputs not exercised by my available test streams still
generate mathematically identical results to the C version.
---
libavcodec/arm/Makefile | 2 +
libavcodec/arm/mlpdsp_arm.S | 433 ++++++++++++++++++++++++++++++++++++++
libavcodec/arm/mlpdsp_init_arm.c | 36 +++
libavcodec/mlpdsp.c | 2 +
libavcodec/mlpdsp.h | 1 +
5 files changed, 474 insertions(+), 0 deletions(-)
create mode 100644 libavcodec/arm/mlpdsp_arm.S
create mode 100644 libavcodec/arm/mlpdsp_init_arm.c
diff --git a/libavcodec/arm/mlpdsp_arm.S b/libavcodec/arm/mlpdsp_arm.S
new file mode 100644
index 0000000..f36ba54
--- /dev/null
+++ b/libavcodec/arm/mlpdsp_arm.S
+.macro loadd_ group, index0, index1, base, offset
+A .if offset >= 256
+A ldr \group\index0, [\base, #\offset]
+A ldr \group\index1, [\base, #(\offset) + 4]
+A .else
+ ldrd \group\index0, \group\index1, [\base, #\offset]
+A .endif
+.endm
Any particular reason the .if uses 'offset' and not '\offset'?
gas-preprocessor can't handle this correctly at the moment.

// Martin
Ben Avison
2014-03-24 16:00:15 UTC
Permalink
Post by Ben Avison
+.macro loadd_ group, index0, index1, base, offset
+A .if offset >= 256
+A ldr \group\index0, [\base, #\offset]
+A ldr \group\index1, [\base, #(\offset) + 4]
+A .else
+ ldrd \group\index0, \group\index1, [\base, #\offset]
+A .endif
+.endm
Any particular reason the .if uses 'offset' and not '\offset'? gas-
preprocessor can't handle this correctly at the moment.
That was a typo. However, it's one that gas seems to be quite happy to
accept - in particular, it seems to match symbols lacking the leading \
against a macro's parameter names iff .altmacro mode was engaged at the
point when that line was encountered (the .altmacro state when the
macro's parameters are evaluated seems to be inconsequential for this
purpose). In this specific case, .altmacro needs to be active when the
loadd_ macro is invoked, in order for the parameter expressions to be
stringised, and the body of the macro happens to leave the .altmacro
state unchanged, hence the fact that it accepts parameter names without
the \.

Of course, the above is determined experimentally, but the GAS
documentation is very unclear on exactly what .altmacro does and the way
macro expansions are performed (note to the maintainers, in case they
ever read this: a handful of examples do not constitute a specification).

Presumably having gas-preprocessor handle .altmacro state to match gas
would be desirable long-term anyway, in case you ever encounter more
substantial blocks of code written to .altmacro conventions? Perhaps I
should really have consistently omitted the \ for code that was intended
to be assembled in .altmacro state, to make it more obvious whether it
was active or not?

Ben
Martin Storsjö
2014-03-24 17:25:38 UTC
Permalink
Post by Ben Avison
Post by Ben Avison
+.macro loadd_ group, index0, index1, base, offset
+A .if offset >= 256
+A ldr \group\index0, [\base, #\offset]
+A ldr \group\index1, [\base, #(\offset) + 4]
+A .else
+ ldrd \group\index0, \group\index1, [\base, #\offset]
+A .endif
+.endm
Any particular reason the .if uses 'offset' and not '\offset'? gas-
preprocessor can't handle this correctly at the moment.
That was a typo. However, it's one that gas seems to be quite happy to
accept - in particular, it seems to match symbols lacking the leading \
against a macro's parameter names iff .altmacro mode was engaged at the
point when that line was encountered (the .altmacro state when the
macro's parameters are evaluated seems to be inconsequential for this
purpose). In this specific case, .altmacro needs to be active when the
loadd_ macro is invoked, in order for the parameter expressions to be
stringised, and the body of the macro happens to leave the .altmacro
state unchanged, hence the fact that it accepts parameter names without
the \.
Of course, the above is determined experimentally, but the GAS
documentation is very unclear on exactly what .altmacro does and the way
macro expansions are performed (note to the maintainers, in case they
ever read this: a handful of examples do not constitute a specification).
Presumably having gas-preprocessor handle .altmacro state to match gas
would be desirable long-term anyway, in case you ever encounter more
substantial blocks of code written to .altmacro conventions?
Sure - adding support for handling macro arguments without backslashes
seems to be quite easy.
Post by Ben Avison
Perhaps I should really have consistently omitted the \ for code that
was intended to be assembled in .altmacro state, to make it more obvious
whether it was active or not?
Not sure if it would be worth intentionally changing it like that given
that these details are undocumented.

// Martin
Ben Avison
2014-03-20 18:58:34 UTC
Permalink
An updated patch series. The main difference here is that for Thumb targets,
it's assumed that interworking is not supported, so individual functions are
either assembled as Thumb, or omitted if they cannot be supported without a
major refactoring.

Ben Avison (6):
truehd: add hand-scheduled ARM asm version of mlp_filter_channel.
truehd: break out part of rematrix_channels into platform-specific
callback.
truehd: add hand-scheduled ARM asm version of
ff_mlp_rematrix_channel.
truehd: tune VLC decoding for ARM.
truehd: break out part of output_data into platform-specific
callback.
truehd: add hand-scheduled ARM asm version of ff_mlp_pack_output.

libavcodec/arm/Makefile | 3 +
libavcodec/arm/mlpdsp_arm.S | 655 ++++++++++++++++++++++++++++++++++++++
libavcodec/arm/mlpdsp_armv6.S | 530 ++++++++++++++++++++++++++++++
libavcodec/arm/mlpdsp_init_arm.c | 144 +++++++++
libavcodec/mlpdec.c | 90 +++---
libavcodec/mlpdsp.c | 73 +++++
libavcodec/mlpdsp.h | 46 +++
7 files changed, 1496 insertions(+), 45 deletions(-)
create mode 100644 libavcodec/arm/mlpdsp_arm.S
create mode 100644 libavcodec/arm/mlpdsp_armv6.S
create mode 100644 libavcodec/arm/mlpdsp_init_arm.c
--
1.7.5.4
Ben Avison
2014-03-20 18:58:38 UTC
Permalink
Profiling on a Raspberry Pi revealed the best performance to correspond
with VLC_BITS = 5. Results for overall audio decode and the get_vlc2 function
in particular are as follows:

Before After
Mean StdDev Mean StdDev Confidence Change
6:2 total 348.8 20.1 339.6 15.1 88.8% +2.7% (insignificant)
6:2 function 38.1 8.1 26.4 4.1 100.0% +44.5%
8:2 total 339.1 15.4 324.5 15.5 99.4% +4.5%
8:2 function 33.8 7.0 27.3 5.6 99.7% +23.6%
6:6 total 604.6 20.8 572.8 20.6 100.0% +5.6%
6:6 function 95.8 8.4 68.9 8.2 100.0% +39.1%
8:8 total 766.4 17.6 741.5 21.2 100.0% +3.4%
8:8 function 106.0 11.4 86.1 9.9 100.0% +23.1%
---
libavcodec/mlpdec.c | 13 ++++++++++---
1 files changed, 10 insertions(+), 3 deletions(-)

diff --git a/libavcodec/mlpdec.c b/libavcodec/mlpdec.c
index c0f2d6a..b9d1704 100644
--- a/libavcodec/mlpdec.c
+++ b/libavcodec/mlpdec.c
@@ -37,9 +37,16 @@
#include "mlp_parser.h"
#include "mlpdsp.h"
#include "mlp.h"
+#include "config.h"

/** number of bits used for VLC lookup - longest Huffman code is 9 */
+#if ARCH_ARM == 1
+#define VLC_BITS 5
+#define VLC_STATIC_SIZE 64
+#else
#define VLC_BITS 9
+#define VLC_STATIC_SIZE 512
+#endif

typedef struct SubStream {
/// Set if a valid restart header has been read. Otherwise the substream cannot be decoded.
@@ -190,13 +197,13 @@ static av_cold void init_static(void)
if (!huff_vlc[0].bits) {
INIT_VLC_STATIC(&huff_vlc[0], VLC_BITS, 18,
&ff_mlp_huffman_tables[0][0][1], 2, 1,
- &ff_mlp_huffman_tables[0][0][0], 2, 1, 512);
+ &ff_mlp_huffman_tables[0][0][0], 2, 1, VLC_STATIC_SIZE);
INIT_VLC_STATIC(&huff_vlc[1], VLC_BITS, 16,
&ff_mlp_huffman_tables[1][0][1], 2, 1,
- &ff_mlp_huffman_tables[1][0][0], 2, 1, 512);
+ &ff_mlp_huffman_tables[1][0][0], 2, 1, VLC_STATIC_SIZE);
INIT_VLC_STATIC(&huff_vlc[2], VLC_BITS, 15,
&ff_mlp_huffman_tables[2][0][1], 2, 1,
- &ff_mlp_huffman_tables[2][0][0], 2, 1, 512);
+ &ff_mlp_huffman_tables[2][0][0], 2, 1, VLC_STATIC_SIZE);
}

ff_mlp_init_crc();
--
1.7.5.4
Martin Storsjö
2014-03-24 19:43:28 UTC
Permalink
Post by Ben Avison
Profiling on a Raspberry Pi revealed the best performance to correspond
with VLC_BITS = 5. Results for overall audio decode and the get_vlc2 function
Before After
Mean StdDev Mean StdDev Confidence Change
6:2 total 348.8 20.1 339.6 15.1 88.8% +2.7% (insignificant)
6:2 function 38.1 8.1 26.4 4.1 100.0% +44.5%
8:2 total 339.1 15.4 324.5 15.5 99.4% +4.5%
8:2 function 33.8 7.0 27.3 5.6 99.7% +23.6%
6:6 total 604.6 20.8 572.8 20.6 100.0% +5.6%
6:6 function 95.8 8.4 68.9 8.2 100.0% +39.1%
8:8 total 766.4 17.6 741.5 21.2 100.0% +3.4%
8:8 function 106.0 11.4 86.1 9.9 100.0% +23.1%
---
libavcodec/mlpdec.c | 13 ++++++++++---
1 files changed, 10 insertions(+), 3 deletions(-)
diff --git a/libavcodec/mlpdec.c b/libavcodec/mlpdec.c
index c0f2d6a..b9d1704 100644
--- a/libavcodec/mlpdec.c
+++ b/libavcodec/mlpdec.c
@@ -37,9 +37,16 @@
#include "mlp_parser.h"
#include "mlpdsp.h"
#include "mlp.h"
+#include "config.h"
/** number of bits used for VLC lookup - longest Huffman code is 9 */
+#if ARCH_ARM == 1
FWIW, this can be written as "#if ARCH_ARM".

// Martin
Ben Avison
2014-03-20 18:58:36 UTC
Permalink
Verified with profiling that this doesn't have a measurable effect upon
overall performance.
---
libavcodec/mlpdec.c | 37 ++++++++++++-------------------------
libavcodec/mlpdsp.c | 33 +++++++++++++++++++++++++++++++++
libavcodec/mlpdsp.h | 23 +++++++++++++++++++++++
3 files changed, 68 insertions(+), 25 deletions(-)

diff --git a/libavcodec/mlpdec.c b/libavcodec/mlpdec.c
index ed5a6ac..c0f2d6a 100644
--- a/libavcodec/mlpdec.c
+++ b/libavcodec/mlpdec.c
@@ -978,7 +978,7 @@ static void fill_noise_buffer(MLPDecodeContext *m, unsigned int substr)
static void rematrix_channels(MLPDecodeContext *m, unsigned int substr)
{
SubStream *s = &m->substream[substr];
- unsigned int mat, src_ch, i;
+ unsigned int mat;
unsigned int maxchan;

maxchan = s->max_matrix_channel;
@@ -990,31 +990,18 @@ static void rematrix_channels(MLPDecodeContext *m, unsigned int substr)
}

for (mat = 0; mat < s->num_primitive_matrices; mat++) {
- int matrix_noise_shift = s->matrix_noise_shift[mat];
unsigned int dest_ch = s->matrix_out_ch[mat];
- int32_t mask = MSB_MASK(s->quant_step_size[dest_ch]);
- int32_t *coeffs = s->matrix_coeff[mat];
- int index = s->num_primitive_matrices - mat;
- int index2 = 2 * index + 1;
-
- /* TODO: DSPContext? */
-
- for (i = 0; i < s->blockpos; i++) {
- int32_t bypassed_lsb = m->bypassed_lsbs[i][mat];
- int32_t *samples = m->sample_buffer[i];
- int64_t accum = 0;
-
- for (src_ch = 0; src_ch <= maxchan; src_ch++)
- accum += (int64_t) samples[src_ch] * coeffs[src_ch];
-
- if (matrix_noise_shift) {
- index &= m->access_unit_size_pow2 - 1;
- accum += m->noise_buffer[index] << (matrix_noise_shift + 7);
- index += index2;
- }
-
- samples[dest_ch] = ((accum >> 14) & mask) + bypassed_lsb;
- }
+ m->dsp.mlp_rematrix_channel(&m->sample_buffer[0][0],
+ s->matrix_coeff[mat],
+ &m->bypassed_lsbs[0][mat],
+ m->noise_buffer,
+ s->num_primitive_matrices - mat,
+ dest_ch,
+ s->blockpos,
+ maxchan,
+ s->matrix_noise_shift[mat],
+ m->access_unit_size_pow2,
+ MSB_MASK(s->quant_step_size[dest_ch]));
}
}

diff --git a/libavcodec/mlpdsp.c b/libavcodec/mlpdsp.c
index 151cf83..dfa13af 100644
--- a/libavcodec/mlpdsp.c
+++ b/libavcodec/mlpdsp.c
@@ -57,9 +57,42 @@ static void mlp_filter_channel(int32_t *state, const int32_t *coeff,
}
}

+void ff_mlp_rematrix_channel(int32_t *samples,
+ const int32_t *coeffs,
+ const uint8_t *bypassed_lsbs,
+ const int8_t *noise_buffer,
+ int index,
+ unsigned int dest_ch,
+ uint16_t blockpos,
+ unsigned int maxchan,
+ int matrix_noise_shift,
+ int access_unit_size_pow2,
+ int32_t mask)
+{
+ unsigned int src_ch, i;
+ int index2 = 2 * index + 1;
+ for (i = 0; i < blockpos; i++) {
+ int64_t accum = 0;
+
+ for (src_ch = 0; src_ch <= maxchan; src_ch++)
+ accum += (int64_t) samples[src_ch] * coeffs[src_ch];
+
+ if (matrix_noise_shift) {
+ index &= access_unit_size_pow2 - 1;
+ accum += noise_buffer[index] << (matrix_noise_shift + 7);
+ index += index2;
+ }
+
+ samples[dest_ch] = ((accum >> 14) & mask) + *bypassed_lsbs;
+ bypassed_lsbs += MAX_CHANNELS;
+ samples += MAX_CHANNELS;
+ }
+}
+
av_cold void ff_mlpdsp_init(MLPDSPContext *c)
{
c->mlp_filter_channel = mlp_filter_channel;
+ c->mlp_rematrix_channel = ff_mlp_rematrix_channel;
if (ARCH_ARM)
ff_mlpdsp_init_arm(c);
if (ARCH_X86)
diff --git a/libavcodec/mlpdsp.h b/libavcodec/mlpdsp.h
index c985a17..bd864d9 100644
--- a/libavcodec/mlpdsp.h
+++ b/libavcodec/mlpdsp.h
@@ -24,11 +24,34 @@

#include <stdint.h>

+void ff_mlp_rematrix_channel(int32_t *samples,
+ const int32_t *coeffs,
+ const uint8_t *bypassed_lsbs,
+ const int8_t *noise_buffer,
+ int index,
+ unsigned int dest_ch,
+ uint16_t blockpos,
+ unsigned int maxchan,
+ int matrix_noise_shift,
+ int access_unit_size_pow2,
+ int32_t mask);
+
typedef struct MLPDSPContext {
void (*mlp_filter_channel)(int32_t *state, const int32_t *coeff,
int firorder, int iirorder,
unsigned int filter_shift, int32_t mask,
int blocksize, int32_t *sample_buffer);
+ void (*mlp_rematrix_channel)(int32_t *samples,
+ const int32_t *coeffs,
+ const uint8_t *bypassed_lsbs,
+ const int8_t *noise_buffer,
+ int index,
+ unsigned int dest_ch,
+ uint16_t blockpos,
+ unsigned int maxchan,
+ int matrix_noise_shift,
+ int access_unit_size_pow2,
+ int32_t mask);
} MLPDSPContext;

void ff_mlpdsp_init(MLPDSPContext *c);
--
1.7.5.4
Ben Avison
2014-03-20 18:58:37 UTC
Permalink
Profiling results for overall audio decode and the rematrix_channels function
in particular are as follows:

Before After
Mean StdDev Mean StdDev Confidence Change
6:2 total 370.8 17.0 348.8 20.1 99.9% +6.3%
6:2 function 46.4 8.4 45.8 6.6 18.0% +1.2% (insignificant)
8:2 total 343.2 19.0 339.1 15.4 54.7% +1.2% (insignificant)
8:2 function 38.9 3.9 40.2 6.9 52.4% -3.2% (insignificant)
6:6 total 658.4 15.7 604.6 20.8 100.0% +8.9%
6:6 function 109.0 8.7 59.5 5.4 100.0% +83.3%
8:8 total 896.2 24.5 766.4 17.6 100.0% +16.9%
8:8 function 223.4 12.8 93.8 5.0 100.0% +138.3%

The assembly version has also been tested with a fuzz tester to ensure that
any combinations of inputs not exercised by my available test streams still
generate mathematically identical results to the C version.
---
libavcodec/arm/mlpdsp_arm.S | 222 ++++++++++++++++++++++++++++++++++++++
libavcodec/arm/mlpdsp_init_arm.c | 12 ++
2 files changed, 234 insertions(+), 0 deletions(-)

diff --git a/libavcodec/arm/mlpdsp_arm.S b/libavcodec/arm/mlpdsp_arm.S
index f36ba54..08cc2d0 100644
--- a/libavcodec/arm/mlpdsp_arm.S
+++ b/libavcodec/arm/mlpdsp_arm.S
@@ -431,3 +431,225 @@ endfunc
.unreq ST3
.unreq I
.unreq PSAMP
+
+/********************************************************************/
+
+PSA .req a1 // samples
+PCO .req a2 // coeffs
+PBL .req a3 // bypassed_lsbs
+INDEX .req a4
+CO0 .req v1
+CO1 .req v2
+CO2 .req v3
+CO3 .req v4
+SA0 .req v5
+SA1 .req v6
+SA2 .req sl
+SA3 .req fp
+AC0 .req ip
+AC1 .req lr
+NOISE .req SA0
+LSB .req SA1
+DCH .req SA2 // dest_ch
+MASK .req SA3
+
+ // INDEX is used as follows:
+ // bits 0..6 index2 (values up to 17, but wider so that we can
+ // add to index field without needing to mask)
+ // bits 7..14 i (values up to 160)
+ // bit 15 underflow detect for i
+ // bits 25..31 (if access_unit_size_pow2 == 128) \ index
+ // bits 26..31 (if access_unit_size_pow2 == 64) /
+
+.macro implement_rematrix shift, index_mask, mask_minus1, maxchan
+ .if \maxchan == 1
+ // We can just leave the coefficients in registers in this case
+ ldrd CO0, CO1, [PCO]
+ .endif
+1:
+ .if \maxchan == 1
+ ldrd SA0, SA1, [PSA]
+ smull AC0, AC1, CO0, SA0
+ .elseif \maxchan == 5
+ ldr CO0, [PCO, #0]
+ ldr SA0, [PSA, #0]
+ ldr CO1, [PCO, #4]
+ ldr SA1, [PSA, #4]
+ ldrd CO2, CO3, [PCO, #8]
+ smull AC0, AC1, CO0, SA0
+ ldrd SA2, SA3, [PSA, #8]
+ smlal AC0, AC1, CO1, SA1
+ ldrd CO0, CO1, [PCO, #16]
+ smlal AC0, AC1, CO2, SA2
+ ldrd SA0, SA1, [PSA, #16]
+ smlal AC0, AC1, CO3, SA3
+ smlal AC0, AC1, CO0, SA0
+ .else // \maxchan == 7
+ ldr CO2, [PCO, #0]
+ ldr SA2, [PSA, #0]
+ ldr CO3, [PCO, #4]
+ ldr SA3, [PSA, #4]
+ ldrd CO0, CO1, [PCO, #8]
+ smull AC0, AC1, CO2, SA2
+ ldrd SA0, SA1, [PSA, #8]
+ smlal AC0, AC1, CO3, SA3
+ ldrd CO2, CO3, [PCO, #16]
+ smlal AC0, AC1, CO0, SA0
+ ldrd SA2, SA3, [PSA, #16]
+ smlal AC0, AC1, CO1, SA1
+ ldrd CO0, CO1, [PCO, #24]
+ smlal AC0, AC1, CO2, SA2
+ ldrd SA0, SA1, [PSA, #24]
+ smlal AC0, AC1, CO3, SA3
+ smlal AC0, AC1, CO0, SA0
+ .endif
+ ldm sp, {NOISE, DCH, MASK}
+ smlal AC0, AC1, CO1, SA1
+ .if \shift != 0
+ .if \index_mask == 63
+ add NOISE, NOISE, INDEX, lsr #32-6
+ ldrb LSB, [PBL], #MAX_CHANNELS
+ ldrsb NOISE, [NOISE]
+ add INDEX, INDEX, INDEX, lsl #32-6
+ .else // \index_mask == 127
+ add NOISE, NOISE, INDEX, lsr #32-7
+ ldrb LSB, [PBL], #MAX_CHANNELS
+ ldrsb NOISE, [NOISE]
+ add INDEX, INDEX, INDEX, lsl #32-7
+ .endif
+ sub INDEX, INDEX, #1<<7
+ adds AC0, AC0, NOISE, lsl #\shift + 7
+ adc AC1, AC1, NOISE, asr #31
+ .else
+ ldrb LSB, [PBL], #MAX_CHANNELS
+ sub INDEX, INDEX, #1<<7
+ .endif
+ add PSA, PSA, #MAX_CHANNELS*4
+ mov AC0, AC0, lsr #14
+ orr AC0, AC0, AC1, lsl #18
+ .if !\mask_minus1
+ and AC0, AC0, MASK
+ .endif
+ add AC0, AC0, LSB
+ tst INDEX, #1<<15
+ str AC0, [PSA, DCH, lsl #2] // DCH is precompensated for the early increment of PSA
+ beq 1b
+ b 98f
+.endm
+
+.macro switch_on_maxchan shift, index_mask, mask_minus1
+ cmp v4, #5
+ blo 51f
+ beq 50f
+ implement_rematrix \shift, \index_mask, \mask_minus1, 7
+50: implement_rematrix \shift, \index_mask, \mask_minus1, 5
+51: implement_rematrix \shift, \index_mask, \mask_minus1, 1
+.endm
+
+.macro switch_on_mask shift, index_mask
+ cmp sl, #-1
+ bne 40f
+ switch_on_maxchan \shift, \index_mask, 1
+40: switch_on_maxchan \shift, \index_mask, 0
+.endm
+
+.macro switch_on_au_size shift
+ .if \shift == 0
+ switch_on_mask \shift, undefined
+ .else
+ teq v6, #64
+ bne 30f
+ orr INDEX, INDEX, v1, lsl #32-6
+ switch_on_mask \shift, 63
+30: orr INDEX, INDEX, v1, lsl #32-7
+ switch_on_mask \shift, 127
+ .endif
+.endm
+
+/* void ff_mlp_rematrix_channel_arm(int32_t *samples,
+ * const int32_t *coeffs,
+ * const uint8_t *bypassed_lsbs,
+ * const int8_t *noise_buffer,
+ * int index,
+ * unsigned int dest_ch,
+ * uint16_t blockpos,
+ * unsigned int maxchan,
+ * int matrix_noise_shift,
+ * int access_unit_size_pow2,
+ * int32_t mask);
+ */
+function ff_mlp_rematrix_channel_arm, export=1
+ push {v1-fp,lr}
+ add v1, sp, #9*4 // point at arguments on stack
+ ldm v1, {v1-sl}
+ teq v4, #1
+ itt ne
+ teqne v4, #5
+ teqne v4, #7
+ bne 99f
+ teq v6, #64
+ it ne
+ teqne v6, #128
+ bne 99f
+ sub v2, v2, #MAX_CHANNELS
+ push {a4,v2,sl} // initialise NOISE,DCH,MASK; make sp dword-aligned
+ movs INDEX, v3, lsl #7
+ beq 98f // just in case, do nothing if blockpos = 0
+ subs INDEX, INDEX, #1<<7 // offset by 1 so we borrow at the right time
+ adc lr, v1, v1 // calculate index2 (C was set by preceding subs)
+ orr INDEX, INDEX, lr
+ // Switch on matrix_noise_shift: values 0 and 1 are
+ // disproportionately common so do those in a form the branch
+ // predictor can accelerate. Values can only go up to 15.
+ cmp v5, #1
+ beq 11f
+ blo 10f
+A ldr pc, [pc, v5, lsl #2]
+T tbh [pc, v5, lsl #1]
+0:
+A .word 0, 0, 0, 12f, 13f, 14f, 15f, 16f, 17f, 18f, 19f, 20f, 21f, 22f, 23f, 24f, 25f
+T .hword 0, 0, (12f - 0b) / 2, (13f - 0b) / 2, (14f - 0b) / 2, (15f - 0b) / 2
+T .hword (16f - 0b) / 2, (17f - 0b) / 2, (18f - 0b) / 2, (19f - 0b) / 2
+T .hword (20f - 0b) / 2, (21f - 0b) / 2, (22f - 0b) / 2, (23f - 0b) / 2, (24f - 0b) / 2, (25f - 0b) / 2
+10: switch_on_au_size 0
+11: switch_on_au_size 1
+12: switch_on_au_size 2
+13: switch_on_au_size 3
+14: switch_on_au_size 4
+15: switch_on_au_size 5
+16: switch_on_au_size 6
+17: switch_on_au_size 7
+18: switch_on_au_size 8
+19: switch_on_au_size 9
+20: switch_on_au_size 10
+21: switch_on_au_size 11
+22: switch_on_au_size 12
+23: switch_on_au_size 13
+24: switch_on_au_size 14
+25: switch_on_au_size 15
+
+98: add sp, sp, #3*4
+ pop {v1-fp,pc}
+99: // Can't handle these parameters, drop back to C
+ pop {v1-fp,lr}
+ b X(ff_mlp_rematrix_channel)
+endfunc
+
+ .unreq PSA
+ .unreq PCO
+ .unreq PBL
+ .unreq INDEX
+ .unreq CO0
+ .unreq CO1
+ .unreq CO2
+ .unreq CO3
+ .unreq SA0
+ .unreq SA1
+ .unreq SA2
+ .unreq SA3
+ .unreq AC0
+ .unreq AC1
+ .unreq NOISE
+ .unreq LSB
+ .unreq DCH
+ .unreq MASK
diff --git a/libavcodec/arm/mlpdsp_init_arm.c b/libavcodec/arm/mlpdsp_init_arm.c
index f0ea285..268dfdd 100644
--- a/libavcodec/arm/mlpdsp_init_arm.c
+++ b/libavcodec/arm/mlpdsp_init_arm.c
@@ -29,8 +29,20 @@ void ff_mlp_filter_channel_arm(int32_t *state, const int32_t *coeff,
int firorder, int iirorder,
unsigned int filter_shift, int32_t mask,
int blocksize, int32_t *sample_buffer);
+void ff_mlp_rematrix_channel_arm(int32_t *samples,
+ const int32_t *coeffs,
+ const uint8_t *bypassed_lsbs,
+ const int8_t *noise_buffer,
+ int index,
+ unsigned int dest_ch,
+ uint16_t blockpos,
+ unsigned int maxchan,
+ int matrix_noise_shift,
+ int access_unit_size_pow2,
+ int32_t mask);

av_cold void ff_mlpdsp_init_arm(MLPDSPContext *c)
{
c->mlp_filter_channel = ff_mlp_filter_channel_arm;
+ c->mlp_rematrix_channel = ff_mlp_rematrix_channel_arm;
}
--
1.7.5.4
Ben Avison
2014-03-20 18:58:39 UTC
Permalink
Verified with profiling that this doesn't have a measurable effect upon
overall performance.
---
libavcodec/mlpdec.c | 40 +++++++++++++++++++++++-----------------
libavcodec/mlpdsp.c | 38 ++++++++++++++++++++++++++++++++++++++
libavcodec/mlpdsp.h | 22 ++++++++++++++++++++++
3 files changed, 83 insertions(+), 17 deletions(-)

diff --git a/libavcodec/mlpdec.c b/libavcodec/mlpdec.c
index b9d1704..49353d9 100644
--- a/libavcodec/mlpdec.c
+++ b/libavcodec/mlpdec.c
@@ -360,6 +360,10 @@ static int read_major_sync(MLPDecodeContext *m, GetBitContext *gb)
m->avctx->sample_fmt = AV_SAMPLE_FMT_S32;
else
m->avctx->sample_fmt = AV_SAMPLE_FMT_S16;
+ m->dsp.mlp_pack_output = m->dsp.mlp_select_pack_output(m->substream[m->max_decoded_substream].ch_assign,
+ m->substream[m->max_decoded_substream].output_shift,
+ m->substream[m->max_decoded_substream].max_matrix_channel,
+ m->avctx->sample_fmt == AV_SAMPLE_FMT_S32);

m->params_valid = 1;
for (substr = 0; substr < MAX_SUBSTREAMS; substr++)
@@ -588,6 +592,10 @@ FF_ENABLE_DEPRECATION_WARNINGS
if (substr == m->max_decoded_substream) {
m->avctx->channels = s->max_matrix_channel + 1;
m->avctx->channel_layout = s->ch_layout;
+ m->dsp.mlp_pack_output = m->dsp.mlp_select_pack_output(s->ch_assign,
+ s->output_shift,
+ s->max_matrix_channel,
+ m->avctx->sample_fmt == AV_SAMPLE_FMT_S32);
}

return 0;
@@ -818,9 +826,15 @@ static int read_decoding_params(MLPDecodeContext *m, GetBitContext *gbp,
return ret;

if (s->param_presence_flags & PARAM_OUTSHIFT)
- if (get_bits1(gbp))
+ if (get_bits1(gbp)) {
for (ch = 0; ch <= s->max_matrix_channel; ch++)
s->output_shift[ch] = get_sbits(gbp, 4);
+ if (substr == m->max_decoded_substream)
+ m->dsp.mlp_pack_output = m->dsp.mlp_select_pack_output(s->ch_assign,
+ s->output_shift,
+ s->max_matrix_channel,
+ m->avctx->sample_fmt == AV_SAMPLE_FMT_S32);
+ }

if (s->param_presence_flags & PARAM_QUANTSTEP)
if (get_bits1(gbp))
@@ -1019,9 +1033,6 @@ static int output_data(MLPDecodeContext *m, unsigned int substr,
{
AVCodecContext *avctx = m->avctx;
SubStream *s = &m->substream[substr];
- unsigned int i, out_ch = 0;
- int32_t *data_32;
- int16_t *data_16;
int ret;
int is32 = (m->avctx->sample_fmt == AV_SAMPLE_FMT_S32);

@@ -1041,19 +1052,14 @@ static int output_data(MLPDecodeContext *m, unsigned int substr,
av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
return ret;
}
- data_32 = (int32_t *)frame->data[0];
- data_16 = (int16_t *)frame->data[0];
-
- for (i = 0; i < s->blockpos; i++) {
- for (out_ch = 0; out_ch <= s->max_matrix_channel; out_ch++) {
- int mat_ch = s->ch_assign[out_ch];
- int32_t sample = m->sample_buffer[i][mat_ch]
- << s->output_shift[mat_ch];
- s->lossless_check_data ^= (sample & 0xffffff) << mat_ch;
- if (is32) *data_32++ = sample << 8;
- else *data_16++ = sample >> 8;
- }
- }
+ s->lossless_check_data = m->dsp.mlp_pack_output(s->lossless_check_data,
+ s->blockpos,
+ m->sample_buffer,
+ frame->data[0],
+ s->ch_assign,
+ s->output_shift,
+ s->max_matrix_channel,
+ is32);

/* Update matrix encoding side data */
if ((ret = ff_side_data_update_matrix_encoding(frame, s->matrix_encoding)) < 0)
diff --git a/libavcodec/mlpdsp.c b/libavcodec/mlpdsp.c
index dfa13af..aded554 100644
--- a/libavcodec/mlpdsp.c
+++ b/libavcodec/mlpdsp.c
@@ -89,10 +89,48 @@ void ff_mlp_rematrix_channel(int32_t *samples,
}
}

+static int32_t (*mlp_select_pack_output(uint8_t *ch_assign,
+ int8_t *output_shift,
+ uint8_t max_matrix_channel,
+ int is32))(int32_t, uint16_t, int32_t (*)[], void *, uint8_t*, int8_t *, uint8_t, int)
+{
+ return ff_mlp_pack_output;
+}
+
+int32_t ff_mlp_pack_output(int32_t lossless_check_data,
+ uint16_t blockpos,
+ int32_t (*sample_buffer)[MAX_CHANNELS],
+ void *data,
+ uint8_t *ch_assign,
+ int8_t *output_shift,
+ uint8_t max_matrix_channel,
+ int is32)
+{
+ unsigned int i, out_ch = 0;
+ int32_t *data_32 = data;
+ int16_t *data_16 = data;
+
+ for (i = 0; i < blockpos; i++) {
+ for (out_ch = 0; out_ch <= max_matrix_channel; out_ch++) {
+ int mat_ch = ch_assign[out_ch];
+ int32_t sample = sample_buffer[i][mat_ch]
+ << output_shift[mat_ch];
+ lossless_check_data ^= (sample & 0xffffff) << mat_ch;
+ if (is32)
+ *data_32++ = sample << 8;
+ else
+ *data_16++ = sample >> 8;
+ }
+ }
+ return lossless_check_data;
+}
+
av_cold void ff_mlpdsp_init(MLPDSPContext *c)
{
c->mlp_filter_channel = mlp_filter_channel;
c->mlp_rematrix_channel = ff_mlp_rematrix_channel;
+ c->mlp_select_pack_output = mlp_select_pack_output;
+ c->mlp_pack_output = ff_mlp_pack_output;
if (ARCH_ARM)
ff_mlpdsp_init_arm(c);
if (ARCH_X86)
diff --git a/libavcodec/mlpdsp.h b/libavcodec/mlpdsp.h
index bd864d9..acd48fc 100644
--- a/libavcodec/mlpdsp.h
+++ b/libavcodec/mlpdsp.h
@@ -23,6 +23,7 @@
#define AVCODEC_MLPDSP_H

#include <stdint.h>
+#include "mlp.h"

void ff_mlp_rematrix_channel(int32_t *samples,
const int32_t *coeffs,
@@ -36,6 +37,15 @@ void ff_mlp_rematrix_channel(int32_t *samples,
int access_unit_size_pow2,
int32_t mask);

+int32_t ff_mlp_pack_output(int32_t lossless_check_data,
+ uint16_t blockpos,
+ int32_t (*sample_buffer)[MAX_CHANNELS],
+ void *data,
+ uint8_t *ch_assign,
+ int8_t *output_shift,
+ uint8_t max_matrix_channel,
+ int is32);
+
typedef struct MLPDSPContext {
void (*mlp_filter_channel)(int32_t *state, const int32_t *coeff,
int firorder, int iirorder,
@@ -52,6 +62,18 @@ typedef struct MLPDSPContext {
int matrix_noise_shift,
int access_unit_size_pow2,
int32_t mask);
+ int32_t (*(*mlp_select_pack_output)(uint8_t *ch_assign,
+ int8_t *output_shift,
+ uint8_t max_matrix_channel,
+ int is32))(int32_t, uint16_t, int32_t (*)[], void *, uint8_t*, int8_t *, uint8_t, int);
+ int32_t (*mlp_pack_output)(int32_t lossless_check_data,
+ uint16_t blockpos,
+ int32_t (*sample_buffer)[MAX_CHANNELS],
+ void *data,
+ uint8_t *ch_assign,
+ int8_t *output_shift,
+ uint8_t max_matrix_channel,
+ int is32);
} MLPDSPContext;

void ff_mlpdsp_init(MLPDSPContext *c);
--
1.7.5.4
Martin Storsjö
2014-03-25 16:25:57 UTC
Permalink
Post by Ben Avison
Verified with profiling that this doesn't have a measurable effect upon
overall performance.
---
libavcodec/mlpdec.c | 40 +++++++++++++++++++++++-----------------
libavcodec/mlpdsp.c | 38 ++++++++++++++++++++++++++++++++++++++
libavcodec/mlpdsp.h | 22 ++++++++++++++++++++++
3 files changed, 83 insertions(+), 17 deletions(-)
diff --git a/libavcodec/mlpdec.c b/libavcodec/mlpdec.c
index b9d1704..49353d9 100644
--- a/libavcodec/mlpdec.c
+++ b/libavcodec/mlpdec.c
@@ -360,6 +360,10 @@ static int read_major_sync(MLPDecodeContext *m, GetBitContext *gb)
m->avctx->sample_fmt = AV_SAMPLE_FMT_S32;
else
m->avctx->sample_fmt = AV_SAMPLE_FMT_S16;
+ m->dsp.mlp_pack_output = m->dsp.mlp_select_pack_output(m->substream[m->max_decoded_substream].ch_assign,
+ m->substream[m->max_decoded_substream].output_shift,
+ m->substream[m->max_decoded_substream].max_matrix_channel,
+ m->avctx->sample_fmt == AV_SAMPLE_FMT_S32);
I'm not sure if this is the right way to do this, or if the
mlp_packet_output function pointer should be in MLPDecodeContext instead?
Practically it probably doesn't matter though.

// Martin
Ben Avison
2014-03-20 18:58:40 UTC
Permalink
Profiling results for overall decode and the output_data function in
particular are as follows:

Before After
Mean StdDev Mean StdDev Confidence Change
6:2 total 339.6 15.1 329.3 16.0 95.8% +3.1% (insignificant)
6:2 function 24.6 6.0 9.9 3.1 100.0% +148.5%
8:2 total 324.5 15.5 323.6 14.3 15.2% +0.3% (insignificant)
8:2 function 20.4 3.9 9.9 3.4 100.0% +104.7%
6:6 total 572.8 20.6 539.9 24.2 100.0% +6.1%
6:6 function 54.5 5.6 16.0 3.8 100.0% +240.9%
8:8 total 741.5 21.2 702.5 18.5 100.0% +5.6%
8:8 function 63.9 7.6 18.4 4.8 100.0% +247.3%

The assembly version has also been tested with a fuzz tester to ensure that
any combinations of inputs not exercised by my available test streams still
generate mathematically identical results to the C version.
---
libavcodec/arm/Makefile | 1 +
libavcodec/arm/mlpdsp_armv6.S | 530 ++++++++++++++++++++++++++++++++++++++
libavcodec/arm/mlpdsp_init_arm.c | 96 +++++++
3 files changed, 627 insertions(+), 0 deletions(-)
create mode 100644 libavcodec/arm/mlpdsp_armv6.S

diff --git a/libavcodec/arm/Makefile b/libavcodec/arm/Makefile
index c6cc96e..8d7e25a 100644
--- a/libavcodec/arm/Makefile
+++ b/libavcodec/arm/Makefile
@@ -51,6 +51,7 @@ ARMV6-OBJS-$(CONFIG_AC3DSP) += arm/ac3dsp_armv6.o
ARMV6-OBJS-$(CONFIG_H264DSP) += arm/h264dsp_armv6.o
ARMV6-OBJS-$(CONFIG_HPELDSP) += arm/hpeldsp_init_armv6.o \
arm/hpeldsp_armv6.o
+ARMV6-OBJS-$(CONFIG_MLP_DECODER) += arm/mlpdsp_armv6.o
ARMV6-OBJS-$(CONFIG_MPEGAUDIODSP) += arm/mpegaudiodsp_fixed_armv6.o
ARMV6-OBJS-$(CONFIG_VP8_DECODER) += arm/vp8_armv6.o \
arm/vp8dsp_init_armv6.o \
diff --git a/libavcodec/arm/mlpdsp_armv6.S b/libavcodec/arm/mlpdsp_armv6.S
new file mode 100644
index 0000000..7f41a15
--- /dev/null
+++ b/libavcodec/arm/mlpdsp_armv6.S
@@ -0,0 +1,530 @@
+/*
+ * Copyright (c) 2014 RISC OS Open Ltd
+ * Author: Ben Avison <***@riscosopen.org>
+ *
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/arm/asm.S"
+
+.macro loadregoffsh2 group, index, base, offgroup, offindex
+ .altmacro
+ loadregoffsh2_ \group, %(\index), \base, \offgroup, %(\offindex)
+ .noaltmacro
+.endm
+
+.macro loadregoffsh2_ group, index, base, offgroup, offindex
+ ldr \group\index, [\base, \offgroup\offindex, lsl #2]
+.endm
+
+.macro eorlslreg check, data, group, index
+ .altmacro
+ eorlslreg_ \check, \data, \group, %(\index)
+ .noaltmacro
+.endm
+
+.macro eorlslreg_ check, data, group, index
+ eor \check, \check, \data, lsl \group\index
+.endm
+
+.macro decr_modulo var, by, modulus
+ .set \var, \var - \by
+ .if \var == 0
+ .set \var, \modulus
+ .endif
+.endm
+
+ .macro load_group1 size, channels, r0, r1, r2, r3, pointer_dead=0
+ .if \size == 2
+ ldrd \r0, \r1, [IN], #(\size + 8 - \channels) * 4
+ .else // size == 4
+ .if IDX1 > 4 || \channels==8
+ ldm IN!, {\r0, \r1, \r2, \r3}
+ .else
+ ldm IN, {\r0, \r1, \r2, \r3}
+ .if !\pointer_dead
+ add IN, IN, #(4 + 8 - \channels) * 4
+ .endif
+ .endif
+ .endif
+ decr_modulo IDX1, \size, \channels
+ .endm
+
+ .macro load_group2 size, channels, r0, r1, r2, r3, pointer_dead=0
+ .if \size == 2
+ .if IDX1 > 2
+ ldm IN!, {\r2, \r3}
+ .else
+//A .ifc \r2, ip
+//A .if \pointer_dead
+//A ldm IN, {\r2, \r3}
+//A .else
+//A ldr \r2, [IN], #4
+//A ldr \r3, [IN], #(\size - 1 + 8 - \channels) * 4
+//A .endif
+//A .else
+ ldrd \r2, \r3, [IN], #(\size + 8 - \channels) * 4
+//A .endif
+ .endif
+ .endif
+ decr_modulo IDX1, \size, \channels
+ .endm
+
+.macro implement_pack inorder, channels, shift
+.if \inorder
+.ifc \shift, mixed
+
+CHECK .req a1
+COUNT .req a2
+IN .req a3
+OUT .req a4
+DAT0 .req v1
+DAT1 .req v2
+DAT2 .req v3
+DAT3 .req v4
+SHIFT0 .req v5
+SHIFT1 .req v6
+SHIFT2 .req sl
+SHIFT3 .req fp
+SHIFT4 .req ip
+SHIFT5 .req lr
+
+ .macro output4words
+ .set SIZE_GROUP1, IDX1
+ .if SIZE_GROUP1 > 4
+ .set SIZE_GROUP1, 4
+ .endif
+ .set SIZE_GROUP2, 4 - SIZE_GROUP1
+ load_group1 SIZE_GROUP1, \channels, DAT0, DAT1, DAT2, DAT3
+ load_group2 SIZE_GROUP2, \channels, DAT0, DAT1, DAT2, DAT3
+ .if \channels == 2
+ lsl DAT0, SHIFT0
+ lsl DAT1, SHIFT1
+ lsl DAT2, SHIFT0
+ lsl DAT3, SHIFT1
+ .elseif \channels == 6
+ .if IDX2 == 6
+ lsl DAT0, SHIFT0
+ lsl DAT1, SHIFT1
+ lsl DAT2, SHIFT2
+ lsl DAT3, SHIFT3
+ .elseif IDX2 == 2
+ lsl DAT0, SHIFT4
+ lsl DAT1, SHIFT5
+ lsl DAT2, SHIFT0
+ lsl DAT3, SHIFT1
+ .else // IDX2 == 4
+ lsl DAT0, SHIFT2
+ lsl DAT1, SHIFT3
+ lsl DAT2, SHIFT4
+ lsl DAT3, SHIFT5
+ .endif
+ .elseif \channels == 8
+ .if IDX2 == 8
+ uxtb SHIFT0, SHIFT4, ror #0
+ uxtb SHIFT1, SHIFT4, ror #8
+ uxtb SHIFT2, SHIFT4, ror #16
+ uxtb SHIFT3, SHIFT4, ror #24
+ .else
+ uxtb SHIFT0, SHIFT5, ror #0
+ uxtb SHIFT1, SHIFT5, ror #8
+ uxtb SHIFT2, SHIFT5, ror #16
+ uxtb SHIFT3, SHIFT5, ror #24
+ .endif
+ lsl DAT0, SHIFT0
+ lsl DAT1, SHIFT1
+ lsl DAT2, SHIFT2
+ lsl DAT3, SHIFT3
+ .endif
+ eor CHECK, CHECK, DAT0, lsr #8 - (\channels - IDX2)
+ eor CHECK, CHECK, DAT1, lsr #7 - (\channels - IDX2)
+ decr_modulo IDX2, 2, \channels
+ eor CHECK, CHECK, DAT2, lsr #8 - (\channels - IDX2)
+ eor CHECK, CHECK, DAT3, lsr #7 - (\channels - IDX2)
+ decr_modulo IDX2, 2, \channels
+ stm OUT!, {DAT0 - DAT3}
+ .endm
+
+ .set WORDS_PER_LOOP, \channels // calculate LCM (channels, 4)
+ .if (WORDS_PER_LOOP % 2) == 0
+ .set WORDS_PER_LOOP, WORDS_PER_LOOP / 2
+ .endif
+ .if (WORDS_PER_LOOP % 2) == 0
+ .set WORDS_PER_LOOP, WORDS_PER_LOOP / 2
+ .endif
+ .set WORDS_PER_LOOP, WORDS_PER_LOOP * 4
+ .set SAMPLES_PER_LOOP, WORDS_PER_LOOP / \channels
+
+function ff_mlp_pack_output_inorder_\channels\()ch_mixedshift_armv6, export=1
+ .if SAMPLES_PER_LOOP > 1
+ tst COUNT, #SAMPLES_PER_LOOP - 1 // always seems to be in practice
+ bne X(ff_mlp_pack_output) // but just in case, branch to C implementation if not
+ .endif
+ teq COUNT, #0
+ it eq
+ bxeq lr
+ push {v1-v6,sl,fp,lr}
+ ldr SHIFT0, [sp, #(9+1)*4] // get output_shift from stack
+ ldr SHIFT1, =0x08080808
+ ldr SHIFT4, [SHIFT0]
+ .if \channels == 2
+ uadd8 SHIFT4, SHIFT4, SHIFT1 // increase all shifts by 8
+ uxtb SHIFT0, SHIFT4, ror #0
+ uxtb SHIFT1, SHIFT4, ror #8
+ .else
+ ldr SHIFT5, [SHIFT0, #4]
+ uadd8 SHIFT4, SHIFT4, SHIFT1 // increase all shifts by 8
+ uadd8 SHIFT5, SHIFT5, SHIFT1
+ .if \channels == 6
+ uxtb SHIFT0, SHIFT4, ror #0
+ uxtb SHIFT1, SHIFT4, ror #8
+ uxtb SHIFT2, SHIFT4, ror #16
+ uxtb SHIFT3, SHIFT4, ror #24
+ uxtb SHIFT4, SHIFT5, ror #0
+ uxtb SHIFT5, SHIFT5, ror #8
+ .endif
+ .endif
+ .set IDX1, \channels
+ .set IDX2, \channels
+0:
+ .rept WORDS_PER_LOOP / 4
+ output4words
+ .endr
+ subs COUNT, COUNT, #SAMPLES_PER_LOOP
+ bne 0b
+ pop {v1-v6,sl,fp,pc}
+ .ltorg
+endfunc
+ .purgem output4words
+
+ .unreq CHECK
+ .unreq COUNT
+ .unreq IN
+ .unreq OUT
+ .unreq DAT0
+ .unreq DAT1
+ .unreq DAT2
+ .unreq DAT3
+ .unreq SHIFT0
+ .unreq SHIFT1
+ .unreq SHIFT2
+ .unreq SHIFT3
+ .unreq SHIFT4
+ .unreq SHIFT5
+
+.else // not mixed
+
+CHECK .req a1
+COUNT .req a2
+IN .req a3
+OUT .req a4
+DAT0 .req v1
+DAT1 .req v2
+DAT2 .req v3
+DAT3 .req v4
+DAT4 .req v5
+DAT5 .req v6
+DAT6 .req sl // use these rather than the otherwise unused
+DAT7 .req fp // ip and lr so that we can load them usinf LDRD
+
+ .macro output4words tail, head, r0, r1, r2, r3, r4, r5, r6, r7, pointer_dead=0
+ .if \head
+ .set SIZE_GROUP1, IDX1
+ .if SIZE_GROUP1 > 4
+ .set SIZE_GROUP1, 4
+ .endif
+ .set SIZE_GROUP2, 4 - SIZE_GROUP1
+ load_group1 SIZE_GROUP1, \channels, \r0, \r1, \r2, \r3, \pointer_dead
+ .endif
+ .if \tail
+ eor CHECK, CHECK, \r4, lsr #8 - (\channels - IDX2)
+ eor CHECK, CHECK, \r5, lsr #7 - (\channels - IDX2)
+ decr_modulo IDX2, 2, \channels
+ .endif
+ .if \head
+ load_group2 SIZE_GROUP2, \channels, \r0, \r1, \r2, \r3, \pointer_dead
+ .endif
+ .if \tail
+ eor CHECK, CHECK, \r6, lsr #8 - (\channels - IDX2)
+ eor CHECK, CHECK, \r7, lsr #7 - (\channels - IDX2)
+ decr_modulo IDX2, 2, \channels
+ stm OUT!, {\r4, \r5, \r6, \r7}
+ .endif
+ .if \head
+ lsl \r0, #8 + \shift
+ lsl \r1, #8 + \shift
+ lsl \r2, #8 + \shift
+ lsl \r3, #8 + \shift
+ .endif
+ .endm
+
+ .set WORDS_PER_LOOP, \channels // calculate LCM (channels, 8)
+ .if (WORDS_PER_LOOP % 2) == 0
+ .set WORDS_PER_LOOP, WORDS_PER_LOOP / 2
+ .endif
+ .if (WORDS_PER_LOOP % 2) == 0
+ .set WORDS_PER_LOOP, WORDS_PER_LOOP / 2
+ .endif
+ .if (WORDS_PER_LOOP % 2) == 0
+ .set WORDS_PER_LOOP, WORDS_PER_LOOP / 2
+ .endif
+ .set WORDS_PER_LOOP, WORDS_PER_LOOP * 8
+ .set SAMPLES_PER_LOOP, WORDS_PER_LOOP / \channels
+
+function ff_mlp_pack_output_inorder_\channels\()ch_\shift\()shift_armv6, export=1
+ .if SAMPLES_PER_LOOP > 1
+ tst COUNT, #SAMPLES_PER_LOOP - 1 // always seems to be in practice
+ bne X(ff_mlp_pack_output) // but just in case, branch to C implementation if not
+ .endif
+ subs COUNT, COUNT, #SAMPLES_PER_LOOP
+ it lo
+ bxlo lr
+ push {v1-v6,sl,fp,lr}
+ .set IDX1, \channels
+ .set IDX2, \channels
+ output4words 0, 1, DAT0, DAT1, DAT2, DAT3, DAT4, DAT5, DAT6, DAT7
+0: beq 1f
+ .rept WORDS_PER_LOOP / 8
+ output4words 1, 1, DAT4, DAT5, DAT6, DAT7, DAT0, DAT1, DAT2, DAT3
+ output4words 1, 1, DAT0, DAT1, DAT2, DAT3, DAT4, DAT5, DAT6, DAT7
+ .endr
+ subs COUNT, COUNT, #SAMPLES_PER_LOOP
+ bne 0b
+1:
+ .rept WORDS_PER_LOOP / 8 - 1
+ output4words 1, 1, DAT4, DAT5, DAT6, DAT7, DAT0, DAT1, DAT2, DAT3
+ output4words 1, 1, DAT0, DAT1, DAT2, DAT3, DAT4, DAT5, DAT6, DAT7
+ .endr
+ output4words 1, 1, DAT4, DAT5, DAT6, DAT7, DAT0, DAT1, DAT2, DAT3, pointer_dead=1
+ output4words 1, 0, DAT0, DAT1, DAT2, DAT3, DAT4, DAT5, DAT6, DAT7
+ pop {v1-v6,sl,fp,pc}
+endfunc
+ .purgem output4words
+
+ .unreq CHECK
+ .unreq COUNT
+ .unreq IN
+ .unreq OUT
+ .unreq DAT0
+ .unreq DAT1
+ .unreq DAT2
+ .unreq DAT3
+ .unreq DAT4
+ .unreq DAT5
+ .unreq DAT6
+ .unreq DAT7
+
+.endif // mixed
+.else // not inorder
+.ifc \shift, mixed
+
+// This case not currently handled
+
+.else // not mixed
+
+#if !CONFIG_THUMB
+
+CHECK .req a1
+COUNT .req a2
+IN .req a3
+OUT .req a4
+DAT0 .req v1
+DAT1 .req v2
+DAT2 .req v3
+DAT3 .req v4
+CHAN0 .req v5
+CHAN1 .req v6
+CHAN2 .req sl
+CHAN3 .req fp
+CHAN4 .req ip
+CHAN5 .req lr
+
+ .macro output4words
+ .if \channels == 8
+ .if IDX1 == 8
+ uxtb CHAN0, CHAN4, ror #0
+ uxtb CHAN1, CHAN4, ror #8
+ uxtb CHAN2, CHAN4, ror #16
+ uxtb CHAN3, CHAN4, ror #24
+ .else
+ uxtb CHAN0, CHAN5, ror #0
+ uxtb CHAN1, CHAN5, ror #8
+ uxtb CHAN2, CHAN5, ror #16
+ uxtb CHAN3, CHAN5, ror #24
+ .endif
+ ldr DAT0, [IN, CHAN0, lsl #2]
+ ldr DAT1, [IN, CHAN1, lsl #2]
+ ldr DAT2, [IN, CHAN2, lsl #2]
+ ldr DAT3, [IN, CHAN3, lsl #2]
+ .if IDX1 == 4
+ add IN, IN, #8*4
+ .endif
+ decr_modulo IDX1, 4, \channels
+ .else
+ .set SIZE_GROUP1, IDX1
+ .if SIZE_GROUP1 > 4
+ .set SIZE_GROUP1, 4
+ .endif
+ .set SIZE_GROUP2, 4 - SIZE_GROUP1
+ .if SIZE_GROUP1 == 2
+ loadregoffsh2 DAT, 0, IN, CHAN, 0 + (\channels - IDX1)
+ loadregoffsh2 DAT, 1, IN, CHAN, 1 + (\channels - IDX1)
+ add IN, IN, #8*4
+ .else // SIZE_GROUP1 == 4
+ loadregoffsh2 DAT, 0, IN, CHAN, 0 + (\channels - IDX1)
+ loadregoffsh2 DAT, 1, IN, CHAN, 1 + (\channels - IDX1)
+ loadregoffsh2 DAT, 2, IN, CHAN, 2 + (\channels - IDX1)
+ loadregoffsh2 DAT, 3, IN, CHAN, 3 + (\channels - IDX1)
+ .if IDX1 == 4
+ add IN, IN, #8*4
+ .endif
+ .endif
+ decr_modulo IDX1, SIZE_GROUP1, \channels
+ .if SIZE_GROUP2 == 2
+ loadregoffsh2 DAT, 2, IN, CHAN, 0 + (\channels - IDX1)
+ loadregoffsh2 DAT, 3, IN, CHAN, 1 + (\channels - IDX1)
+ .if IDX1 == 2
+ add IN, IN, #8*4
+ .endif
+ .endif
+ decr_modulo IDX1, SIZE_GROUP2, \channels
+ .endif
+ .if \channels == 8 // in this case we can corrupt CHAN0-3
+ rsb CHAN0, CHAN0, #8
+ rsb CHAN1, CHAN1, #8
+ rsb CHAN2, CHAN2, #8
+ rsb CHAN3, CHAN3, #8
+ lsl DAT0, #8 + \shift
+ lsl DAT1, #8 + \shift
+ lsl DAT2, #8 + \shift
+ lsl DAT3, #8 + \shift
+ eor CHECK, CHECK, DAT0, lsr CHAN0
+ eor CHECK, CHECK, DAT1, lsr CHAN1
+ eor CHECK, CHECK, DAT2, lsr CHAN2
+ eor CHECK, CHECK, DAT3, lsr CHAN3
+ .else
+ .if \shift != 0
+ lsl DAT0, #\shift
+ lsl DAT1, #\shift
+ lsl DAT2, #\shift
+ lsl DAT3, #\shift
+ .endif
+ bic DAT0, DAT0, #0xff000000
+ bic DAT1, DAT1, #0xff000000
+ bic DAT2, DAT2, #0xff000000
+ bic DAT3, DAT3, #0xff000000
+ eorlslreg CHECK, DAT0, CHAN, 0 + (\channels - IDX2)
+ eorlslreg CHECK, DAT1, CHAN, 1 + (\channels - IDX2)
+ decr_modulo IDX2, 2, \channels
+ eorlslreg CHECK, DAT2, CHAN, 0 + (\channels - IDX2)
+ eorlslreg CHECK, DAT3, CHAN, 1 + (\channels - IDX2)
+ decr_modulo IDX2, 2, \channels
+ lsl DAT0, #8
+ lsl DAT1, #8
+ lsl DAT2, #8
+ lsl DAT3, #8
+ .endif
+ stm OUT!, {DAT0 - DAT3}
+ .endm
+
+ .set WORDS_PER_LOOP, \channels // calculate LCM (channels, 4)
+ .if (WORDS_PER_LOOP % 2) == 0
+ .set WORDS_PER_LOOP, WORDS_PER_LOOP / 2
+ .endif
+ .if (WORDS_PER_LOOP % 2) == 0
+ .set WORDS_PER_LOOP, WORDS_PER_LOOP / 2
+ .endif
+ .set WORDS_PER_LOOP, WORDS_PER_LOOP * 4
+ .set SAMPLES_PER_LOOP, WORDS_PER_LOOP / \channels
+
+function ff_mlp_pack_output_outoforder_\channels\()ch_\shift\()shift_armv6, export=1
+ .if SAMPLES_PER_LOOP > 1
+ tst COUNT, #SAMPLES_PER_LOOP - 1 // always seems to be in practice
+ bne X(ff_mlp_pack_output) // but just in case, branch to C implementation if not
+ .endif
+ teq COUNT, #0
+ it eq
+ bxeq lr
+ push {v1-v6,sl,fp,lr}
+ ldr CHAN0, [sp, #(9+0)*4] // get ch_assign from stack
+ ldr CHAN4, [CHAN0]
+ .if \channels == 2
+ uxtb CHAN0, CHAN4, ror #0
+ uxtb CHAN1, CHAN4, ror #8
+ .else
+ ldr CHAN5, [CHAN0, #4]
+ .if \channels == 6
+ uxtb CHAN0, CHAN4, ror #0
+ uxtb CHAN1, CHAN4, ror #8
+ uxtb CHAN2, CHAN4, ror #16
+ uxtb CHAN3, CHAN4, ror #24
+ uxtb CHAN4, CHAN5, ror #0
+ uxtb CHAN5, CHAN5, ror #8
+ .endif
+ .endif
+ .set IDX1, \channels
+ .set IDX2, \channels
+0:
+ .rept WORDS_PER_LOOP / 4
+ output4words
+ .endr
+ subs COUNT, COUNT, #SAMPLES_PER_LOOP
+ bne 0b
+ pop {v1-v6,sl,fp,pc}
+ .ltorg
+endfunc
+ .purgem output4words
+
+ .unreq CHECK
+ .unreq COUNT
+ .unreq IN
+ .unreq OUT
+ .unreq DAT0
+ .unreq DAT1
+ .unreq DAT2
+ .unreq DAT3
+ .unreq CHAN0
+ .unreq CHAN1
+ .unreq CHAN2
+ .unreq CHAN3
+ .unreq CHAN4
+ .unreq CHAN5
+
+#endif // !CONFIG_THUMB
+
+.endif // mixed
+.endif // inorder
+.endm // implement_pack
+
+.macro pack_channels inorder, channels
+ implement_pack \inorder, \channels, 0
+ implement_pack \inorder, \channels, 1
+ implement_pack \inorder, \channels, 2
+ implement_pack \inorder, \channels, 3
+ implement_pack \inorder, \channels, 4
+ implement_pack \inorder, \channels, 5
+ implement_pack \inorder, \channels, mixed
+.endm
+
+.macro pack_order inorder
+ pack_channels \inorder, 2
+ pack_channels \inorder, 6
+ pack_channels \inorder, 8
+.endm
+
+ pack_order 0
+ pack_order 1
diff --git a/libavcodec/arm/mlpdsp_init_arm.c b/libavcodec/arm/mlpdsp_init_arm.c
index 268dfdd..20a5c06 100644
--- a/libavcodec/arm/mlpdsp_init_arm.c
+++ b/libavcodec/arm/mlpdsp_init_arm.c
@@ -41,8 +41,104 @@ void ff_mlp_rematrix_channel_arm(int32_t *samples,
int access_unit_size_pow2,
int32_t mask);

+#define DECLARE_PACK(order,channels,shift) \
+ int32_t ff_mlp_pack_output_##order##order_##channels##ch_##shift##shift_armv6(int32_t, uint16_t, int32_t (*)[], void *, uint8_t*, int8_t *, uint8_t, int);
+#define ENUMERATE_PACK(order,channels,shift) \
+ ff_mlp_pack_output_##order##order_##channels##ch_##shift##shift_armv6,
+#define PACK_CHANNELS(macro,order,channels) \
+ macro(order,channels,0) \
+ macro(order,channels,1) \
+ macro(order,channels,2) \
+ macro(order,channels,3) \
+ macro(order,channels,4) \
+ macro(order,channels,5) \
+ macro(order,channels,mixed)
+#define PACK_ORDER(macro,order) \
+ PACK_CHANNELS(macro,order,2) \
+ PACK_CHANNELS(macro,order,6) \
+ PACK_CHANNELS(macro,order,8)
+#define PACK_ALL(macro) \
+ PACK_ORDER(macro,outof) \
+ PACK_ORDER(macro,in)
+PACK_ALL(DECLARE_PACK)
+
+#define ff_mlp_pack_output_outoforder_2ch_mixedshift_armv6 0
+#define ff_mlp_pack_output_outoforder_6ch_mixedshift_armv6 0
+#define ff_mlp_pack_output_outoforder_8ch_mixedshift_armv6 0
+#if CONFIG_THUMB
+#define ff_mlp_pack_output_outoforder_2ch_0shift_armv6 0
+#define ff_mlp_pack_output_outoforder_2ch_1shift_armv6 0
+#define ff_mlp_pack_output_outoforder_2ch_2shift_armv6 0
+#define ff_mlp_pack_output_outoforder_2ch_3shift_armv6 0
+#define ff_mlp_pack_output_outoforder_2ch_4shift_armv6 0
+#define ff_mlp_pack_output_outoforder_2ch_5shift_armv6 0
+#define ff_mlp_pack_output_outoforder_6ch_0shift_armv6 0
+#define ff_mlp_pack_output_outoforder_6ch_1shift_armv6 0
+#define ff_mlp_pack_output_outoforder_6ch_2shift_armv6 0
+#define ff_mlp_pack_output_outoforder_6ch_3shift_armv6 0
+#define ff_mlp_pack_output_outoforder_6ch_4shift_armv6 0
+#define ff_mlp_pack_output_outoforder_6ch_5shift_armv6 0
+#define ff_mlp_pack_output_outoforder_8ch_0shift_armv6 0
+#define ff_mlp_pack_output_outoforder_8ch_1shift_armv6 0
+#define ff_mlp_pack_output_outoforder_8ch_2shift_armv6 0
+#define ff_mlp_pack_output_outoforder_8ch_3shift_armv6 0
+#define ff_mlp_pack_output_outoforder_8ch_4shift_armv6 0
+#define ff_mlp_pack_output_outoforder_8ch_5shift_armv6 0
+#endif
+
+static int32_t (*mlp_select_pack_output_armv6(uint8_t *ch_assign,
+ int8_t *output_shift,
+ uint8_t max_matrix_channel,
+ int is32))(int32_t, uint16_t, int32_t (*)[], void *, uint8_t*, int8_t *, uint8_t, int)
+{
+ int ch_index;
+ int shift = output_shift[0] < 0 || output_shift[0] > 5 ? 6 : output_shift[0];
+ int inorder = 1;
+ static int32_t (*const routine[2*3*7])(int32_t, uint16_t, int32_t (*)[], void *, uint8_t*, int8_t *, uint8_t, int) = {
+ PACK_ALL(ENUMERATE_PACK)
+ };
+ int i;
+
+ if (!is32) // don't support 16-bit output (it's not used by TrueHD)
+ return ff_mlp_pack_output;
+
+ switch (max_matrix_channel) {
+ case 1:
+ ch_index = 0;
+ break;
+ case 5:
+ ch_index = 1;
+ break;
+ case 7:
+ ch_index = 2;
+ break;
+ default:
+ return ff_mlp_pack_output;
+ }
+
+ for (i = 0; i <= max_matrix_channel; i++) {
+ if (shift != 6 && output_shift[i] != shift)
+ shift = 6; // indicate mixed shifts
+ if (ch_assign[i] != i)
+ inorder = 0;
+ }
+#if CONFIG_THUMB
+ if (!inorder)
+ return ff_mlp_pack_output; // can't currently handle an order array except in ARM mode
+#else
+ if (shift == 6 && !inorder)
+ return ff_mlp_pack_output; // can't currently handle both an order array and a shift array
+#endif
+
+ return routine[(inorder*3+ch_index)*7+shift];
+}
+
av_cold void ff_mlpdsp_init_arm(MLPDSPContext *c)
{
+ int cpu_flags = av_get_cpu_flags();
+
c->mlp_filter_channel = ff_mlp_filter_channel_arm;
c->mlp_rematrix_channel = ff_mlp_rematrix_channel_arm;
+ if (cpu_flags & AV_CPU_FLAG_ARMV6)
+ c->mlp_select_pack_output = mlp_select_pack_output_armv6;
}
--
1.7.5.4
Martin Storsjö
2014-03-24 12:12:47 UTC
Permalink
Post by Ben Avison
Profiling results for overall decode and the output_data function in
Before After
Mean StdDev Mean StdDev Confidence Change
6:2 total 339.6 15.1 329.3 16.0 95.8% +3.1% (insignificant)
6:2 function 24.6 6.0 9.9 3.1 100.0% +148.5%
8:2 total 324.5 15.5 323.6 14.3 15.2% +0.3% (insignificant)
8:2 function 20.4 3.9 9.9 3.4 100.0% +104.7%
6:6 total 572.8 20.6 539.9 24.2 100.0% +6.1%
6:6 function 54.5 5.6 16.0 3.8 100.0% +240.9%
8:8 total 741.5 21.2 702.5 18.5 100.0% +5.6%
8:8 function 63.9 7.6 18.4 4.8 100.0% +247.3%
The assembly version has also been tested with a fuzz tester to ensure that
any combinations of inputs not exercised by my available test streams still
generate mathematically identical results to the C version.
---
libavcodec/arm/Makefile | 1 +
libavcodec/arm/mlpdsp_armv6.S | 530 ++++++++++++++++++++++++++++++++++++++
libavcodec/arm/mlpdsp_init_arm.c | 96 +++++++
3 files changed, 627 insertions(+), 0 deletions(-)
create mode 100644 libavcodec/arm/mlpdsp_armv6.S
+function ff_mlp_pack_output_inorder_\channels\()ch_mixedshift_armv6, export=1
+ .if SAMPLES_PER_LOOP > 1
+ tst COUNT, #SAMPLES_PER_LOOP - 1 // always seems to be in practice
+ bne X(ff_mlp_pack_output) // but just in case, branch to C implementation if not
+ .endif
This (and a few similar occurrances further below) is lacking the "it ne"
in order to be able to build it in thumb mode.

// Martin
Janne Grunau
2014-03-24 12:41:33 UTC
Permalink
Post by Martin Storsjö
Post by Ben Avison
Profiling results for overall decode and the output_data function in
Before After
Mean StdDev Mean StdDev Confidence Change
6:2 total 339.6 15.1 329.3 16.0 95.8% +3.1% (insignificant)
6:2 function 24.6 6.0 9.9 3.1 100.0% +148.5%
8:2 total 324.5 15.5 323.6 14.3 15.2% +0.3% (insignificant)
8:2 function 20.4 3.9 9.9 3.4 100.0% +104.7%
6:6 total 572.8 20.6 539.9 24.2 100.0% +6.1%
6:6 function 54.5 5.6 16.0 3.8 100.0% +240.9%
8:8 total 741.5 21.2 702.5 18.5 100.0% +5.6%
8:8 function 63.9 7.6 18.4 4.8 100.0% +247.3%
The assembly version has also been tested with a fuzz tester to ensure that
any combinations of inputs not exercised by my available test streams still
generate mathematically identical results to the C version.
---
libavcodec/arm/Makefile | 1 +
libavcodec/arm/mlpdsp_armv6.S | 530 ++++++++++++++++++++++++++++++++++++++
libavcodec/arm/mlpdsp_init_arm.c | 96 +++++++
3 files changed, 627 insertions(+), 0 deletions(-)
create mode 100644 libavcodec/arm/mlpdsp_armv6.S
+function ff_mlp_pack_output_inorder_\channels\()ch_mixedshift_armv6, export=1
+ .if SAMPLES_PER_LOOP > 1
+ tst COUNT, #SAMPLES_PER_LOOP - 1 // always seems to be in practice
+ bne X(ff_mlp_pack_output) // but just in case, branch to C implementation if not
+ .endif
This (and a few similar occurrances further below) is lacking the
"it ne" in order to be able to build it in thumb mode.
Does it fail to build? branch is the only instruction with conditional
encodings in thumb mode. The encodings which require an 'it' only allow
a larger immediate offset.

Janne
Martin Storsjö
2014-03-24 12:49:34 UTC
Permalink
Post by Janne Grunau
Post by Martin Storsjö
Post by Ben Avison
+function ff_mlp_pack_output_inorder_\channels\()ch_mixedshift_armv6, export=1
+ .if SAMPLES_PER_LOOP > 1
+ tst COUNT, #SAMPLES_PER_LOOP - 1 // always seems to be in practice
+ bne X(ff_mlp_pack_output) // but just in case, branch to C implementation if not
+ .endif
This (and a few similar occurrances further below) is lacking the
"it ne" in order to be able to build it in thumb mode.
Does it fail to build? branch is the only instruction with conditional
encodings in thumb mode. The encodings which require an 'it' only allow
a larger immediate offset.
Yes, it fails to build. Building with apple tools fails simply like this:

libavcodec/arm/mlpdsp_armv6.S:-915:9: error: unsupported relocation on symbol
bne _ff_mlp_pack_output

Building with proper binutils on linux fails at the linking stage:

libavcodec/libavcodec.a(mlpdsp_armv6.o): In function `ff_mlp_pack_output_inorder_2ch_0shift_armv6':
libavcodec/arm/mlpdsp_armv6.S:353:(.text+0x4): relocation truncated to fit:
R_ARM_THM_JUMP19 against symbol `ff_mlp_pack_output' defined in .text
section in libavcodec/libavcodec.a(mlpdsp.o)

// Martin
Janne Grunau
2014-03-24 13:33:30 UTC
Permalink
Post by Martin Storsjö
Post by Janne Grunau
Post by Martin Storsjö
Post by Ben Avison
+function ff_mlp_pack_output_inorder_\channels\()ch_mixedshift_armv6, export=1
+ .if SAMPLES_PER_LOOP > 1
+ tst COUNT, #SAMPLES_PER_LOOP - 1 // always seems to be in practice
+ bne X(ff_mlp_pack_output) // but just in case, branch to C implementation if not
+ .endif
This (and a few similar occurrances further below) is lacking the
"it ne" in order to be able to build it in thumb mode.
Does it fail to build? branch is the only instruction with conditional
encodings in thumb mode. The encodings which require an 'it' only allow
a larger immediate offset.
libavcodec/arm/mlpdsp_armv6.S:-915:9: error: unsupported relocation on symbol
bne _ff_mlp_pack_output
R_ARM_THM_JUMP19 against symbol `ff_mlp_pack_output' defined in
.text section in libavcodec/libavcodec.a(mlpdsp.o)
ok, the label is out of range and the additional 4 bits for the offset
we gain by using 'it' fixes it. I guess it could be either fixed by
making sure arm/mlpdsp_armv6.o and mlpdsp.o are placed close together
in the link command, using 'it' or not branching into the C function.
Enforcing an order on the link command is unfortunately quite hard since
we rely on $(sort) to filter out duplicates. I guess the easiest fix is
using 'it'.

Janne
Martin Storsjö
2014-03-24 13:37:08 UTC
Permalink
Post by Janne Grunau
Post by Martin Storsjö
Post by Janne Grunau
Post by Martin Storsjö
Post by Ben Avison
+function ff_mlp_pack_output_inorder_\channels\()ch_mixedshift_armv6, export=1
+ .if SAMPLES_PER_LOOP > 1
+ tst COUNT, #SAMPLES_PER_LOOP - 1 // always seems to be in practice
+ bne X(ff_mlp_pack_output) // but just in case, branch to C implementation if not
+ .endif
This (and a few similar occurrances further below) is lacking the
"it ne" in order to be able to build it in thumb mode.
Does it fail to build? branch is the only instruction with conditional
encodings in thumb mode. The encodings which require an 'it' only allow
a larger immediate offset.
libavcodec/arm/mlpdsp_armv6.S:-915:9: error: unsupported relocation on symbol
bne _ff_mlp_pack_output
R_ARM_THM_JUMP19 against symbol `ff_mlp_pack_output' defined in
.text section in libavcodec/libavcodec.a(mlpdsp.o)
ok, the label is out of range and the additional 4 bits for the offset
we gain by using 'it' fixes it. I guess it could be either fixed by
making sure arm/mlpdsp_armv6.o and mlpdsp.o are placed close together
in the link command, using 'it' or not branching into the C function.
Enforcing an order on the link command is unfortunately quite hard since
we rely on $(sort) to filter out duplicates. I guess the easiest fix is
using 'it'.
Yes, and in the apple/iOS case, it even fails at compile time before the
actual distance is known - apparently it doesn't support relocations
in conditional thumb branches at all, so moving them closer in the link
wouldn't help.

// Martin
Ben Avison
2014-03-24 16:17:15 UTC
Permalink
Post by Martin Storsjö
Post by Janne Grunau
Post by Martin Storsjö
Post by Ben Avison
+function ff_mlp_pack_output_inorder_\channels\()ch_mixedshift_armv6, export=1
+ .if SAMPLES_PER_LOOP > 1
+ tst COUNT, #SAMPLES_PER_LOOP - 1 // always seems to be in practice
+ bne X(ff_mlp_pack_output) // but just in case, branch to C implementation if not
+ .endif
This (and a few similar occurrances further below) is lacking the
"it ne" in order to be able to build it in thumb mode.
Does it fail to build? branch is the only instruction with conditional
encodings in thumb mode. The encodings which require an 'it' only allow
a larger immediate offset.
libavcodec/arm/mlpdsp_armv6.S:-915:9: error: unsupported relocation on symbol
bne _ff_mlp_pack_output
R_ARM_THM_JUMP19 against symbol `ff_mlp_pack_output' defined in .text section in libavcodec/libavcodec.a(mlpdsp.o)
Perhaps it's worth raising this as a fault against the toolchains? At the
time of assembly, the assembler must be aware that it's emitting a branch
to an external symbol, and it's stupid of it to use an encoding that its
native linker is unable to relocate. (Of course in the meantime we can
change libav to be more explicit about the encoding to use.)

For the record, Thumb has 4 different encodings for conditional branches,
which can be written

@ not in an it block
b<cond>.n <target>

it <cond>
b<cond>.n <target>

@ not in an it block
b<cond>.w <target>

it <cond>
b<cond>.w <target>

All four have different ranges (I list them above in increasing range
order, as well as more cycles to execute and lower code density). Is it
only the last one that's safe on all toolchains?

Even the last one only has a range of +/- 16 MB, and given the size of
the libavcodec binary, I'm guessing the linker probably already has to
generate some sort of trampolines to permit branching from any function
to any other in the resulting binary...

Ben
Martin Storsjö
2014-03-25 16:28:47 UTC
Permalink
Post by Ben Avison
An updated patch series. The main difference here is that for Thumb targets,
it's assumed that interworking is not supported, so individual functions are
either assembled as Thumb, or omitted if they cannot be supported without a
major refactoring.
truehd: add hand-scheduled ARM asm version of mlp_filter_channel.
truehd: break out part of rematrix_channels into platform-specific
callback.
truehd: add hand-scheduled ARM asm version of
ff_mlp_rematrix_channel.
truehd: tune VLC decoding for ARM.
truehd: break out part of output_data into platform-specific
callback.
truehd: add hand-scheduled ARM asm version of ff_mlp_pack_output.
libavcodec/arm/Makefile | 3 +
libavcodec/arm/mlpdsp_arm.S | 655 ++++++++++++++++++++++++++++++++++++++
libavcodec/arm/mlpdsp_armv6.S | 530 ++++++++++++++++++++++++++++++
libavcodec/arm/mlpdsp_init_arm.c | 144 +++++++++
libavcodec/mlpdec.c | 90 +++---
libavcodec/mlpdsp.c | 73 +++++
libavcodec/mlpdsp.h | 46 +++
7 files changed, 1496 insertions(+), 45 deletions(-)
create mode 100644 libavcodec/arm/mlpdsp_arm.S
create mode 100644 libavcodec/arm/mlpdsp_armv6.S
create mode 100644 libavcodec/arm/mlpdsp_init_arm.c
All in all the series looks ok - any objections to me pushing this any day
soon, with "it ne" added before the conditional branches to C functions,
and with the altmacro parameter changed to use normal parameter syntax
(offset vs \offset in patch 1/6)?

// Martin
Ben Avison
2014-03-25 17:44:11 UTC
Permalink
Post by Martin Storsjö
All in all the series looks ok - any objections to me pushing this any
day soon, with "it ne" added before the conditional branches to C
functions, and with the altmacro parameter changed to use normal
parameter syntax (offset vs \offset in patch 1/6)?
Go for it. I didn't want to keep spamming the list with more and more
reposts of the series with only relatively minor and cosmetic changes,
so I was waiting for things to die down, but I'm also happy for you to
make the changes at the time you commit them.

Ben
Martin Storsjö
2014-03-26 07:32:57 UTC
Permalink
Post by Ben Avison
Post by Martin Storsjö
All in all the series looks ok - any objections to me pushing this any
day soon, with "it ne" added before the conditional branches to C
functions, and with the altmacro parameter changed to use normal
parameter syntax (offset vs \offset in patch 1/6)?
Go for it. I didn't want to keep spamming the list with more and more
reposts of the series with only relatively minor and cosmetic changes,
so I was waiting for things to die down, but I'm also happy for you to
make the changes at the time you commit them.
I found a few more details, but I can fix them up before pushing as well.

mlpdsp_arm.S uses ldrd, which is only available in armv5te (we've got one
armv4t and one armv5t setup on fate). Renaming the file to _armv5te.S and
building it conditionally within that setup is easy though.

The checks in mlpdsp_init_arm.c shouldn't just check cpu_flags, they also
need to take into account the fact that the optimizations might not be
built at all - which is handled by the have_armv5te() or have_armv6()
macros.

I've done these fixes locally and will include them when pushing.

// Martin
Martin Storsjö
2014-03-26 18:38:44 UTC
Permalink
Post by Martin Storsjö
Post by Ben Avison
Post by Martin Storsjö
All in all the series looks ok - any objections to me pushing this any
day soon, with "it ne" added before the conditional branches to C
functions, and with the altmacro parameter changed to use normal
parameter syntax (offset vs \offset in patch 1/6)?
Go for it. I didn't want to keep spamming the list with more and more
reposts of the series with only relatively minor and cosmetic changes,
so I was waiting for things to die down, but I'm also happy for you to
make the changes at the time you commit them.
I found a few more details, but I can fix them up before pushing as well.
mlpdsp_arm.S uses ldrd, which is only available in armv5te (we've got one
armv4t and one armv5t setup on fate). Renaming the file to _armv5te.S and
building it conditionally within that setup is easy though.
The checks in mlpdsp_init_arm.c shouldn't just check cpu_flags, they also
need to take into account the fact that the optimizations might not be built
at all - which is handled by the have_armv5te() or have_armv6() macros.
I've done these fixes locally and will include them when pushing.
... and pushed. Thanks for your contribution!

// Martin

Loading...