diff --git a/src/libs/decoders/dr_flac.h b/src/libs/decoders/dr_flac.h index a7527763..58ee1076 100644 --- a/src/libs/decoders/dr_flac.h +++ b/src/libs/decoders/dr_flac.h @@ -1,6 +1,6 @@ /* FLAC audio decoder. Choice of public domain or MIT-0. See license statements at the end of this file. -dr_flac - v0.12.9 - 2020-04-05 +dr_flac - v0.12.11 - 2020-04-19 David Reid - mackron@gmail.com @@ -151,10 +151,7 @@ samples as it can, up to the amount requested. Later on when you need the next b } ``` -You can seek to a specific sample with drflac_seek_to_sample(). The given sample is based on interleaving. So for example, if you were to seek to the sample at -index 0 in a stereo stream, you'll be seeking to the first sample of the left channel. The sample at index 1 will be the first sample of the right channel. The -sample at index 2 will be the second sample of the left channel, etc. - +You can seek to a specific PCM frame with `drflac_seek_to_pcm_frame()`. If you just want to quickly decode an entire FLAC file in one go you can do something like this: @@ -1458,7 +1455,7 @@ DRFLAC_API drflac_bool32 drflac_next_cuesheet_track(drflac_cuesheet_track_iterat #define DRFLAC_NO_CPUID #endif -static DRFLAC_INLINE drflac_bool32 drflac_has_sse2() +static DRFLAC_INLINE drflac_bool32 drflac_has_sse2(void) { #if defined(DRFLAC_SUPPORT_SSE2) #if (defined(DRFLAC_X64) || defined(DRFLAC_X86)) && !defined(DRFLAC_NO_SSE2) @@ -1483,7 +1480,7 @@ static DRFLAC_INLINE drflac_bool32 drflac_has_sse2() #endif } -static DRFLAC_INLINE drflac_bool32 drflac_has_sse41() +static DRFLAC_INLINE drflac_bool32 drflac_has_sse41(void) { #if defined(DRFLAC_SUPPORT_SSE41) #if (defined(DRFLAC_X64) || defined(DRFLAC_X86)) && !defined(DRFLAC_NO_SSE41) @@ -1673,7 +1670,7 @@ actually make sense. However, since CPU caps should never differ for a running p complicating internal API's by passing around CPU caps versus just disabling the warnings is worthwhile. I'm therefore just going to disable these warnings. 
This is disabled via the DRFLAC_NO_THREAD_SANITIZE attribute. */ -DRFLAC_NO_THREAD_SANITIZE static void drflac__init_cpu_caps() +DRFLAC_NO_THREAD_SANITIZE static void drflac__init_cpu_caps(void) { static drflac_bool32 isCPUCapsInitialized = DRFLAC_FALSE; @@ -1698,7 +1695,7 @@ DRFLAC_NO_THREAD_SANITIZE static void drflac__init_cpu_caps() #else static drflac_bool32 drflac__gIsNEONSupported = DRFLAC_FALSE; -static DRFLAC_INLINE drflac_bool32 drflac__has_neon() +static DRFLAC_INLINE drflac_bool32 drflac__has_neon(void) { #if defined(DRFLAC_SUPPORT_NEON) #if defined(DRFLAC_ARM) && !defined(DRFLAC_NO_NEON) @@ -1716,7 +1713,7 @@ static DRFLAC_INLINE drflac_bool32 drflac__has_neon() #endif } -DRFLAC_NO_THREAD_SANITIZE static void drflac__init_cpu_caps() +DRFLAC_NO_THREAD_SANITIZE static void drflac__init_cpu_caps(void) { drflac__gIsNEONSupported = drflac__has_neon(); @@ -1728,7 +1725,7 @@ DRFLAC_NO_THREAD_SANITIZE static void drflac__init_cpu_caps() /* Endian Management */ -static DRFLAC_INLINE drflac_bool32 drflac__is_little_endian() +static DRFLAC_INLINE drflac_bool32 drflac__is_little_endian(void) { #if defined(DRFLAC_X86) || defined(DRFLAC_X64) return DRFLAC_TRUE; @@ -2632,7 +2629,7 @@ static DRFLAC_INLINE drflac_uint32 drflac__clz_software(drflac_cache_t x) } #ifdef DRFLAC_IMPLEMENT_CLZ_LZCNT -static DRFLAC_INLINE drflac_bool32 drflac__is_lzcnt_supported() +static DRFLAC_INLINE drflac_bool32 drflac__is_lzcnt_supported(void) { /* Fast compile time check for ARM. 
*/ #if defined(DRFLAC_HAS_LZCNT_INTRINSIC) && defined(DRFLAC_ARM) && (defined(__ARM_ARCH) && __ARM_ARCH >= 5) @@ -4124,6 +4121,12 @@ static DRFLAC_INLINE void drflac__vst2q_s32(drflac_int32* p, int32x4x2_t x) vst1q_s32(p+4, x.val[1]); } +static DRFLAC_INLINE void drflac__vst2q_u32(drflac_uint32* p, uint32x4x2_t x) +{ + vst1q_u32(p+0, x.val[0]); + vst1q_u32(p+4, x.val[1]); +} + static DRFLAC_INLINE void drflac__vst2q_f32(float* p, float32x4x2_t x) { vst1q_f32(p+0, x.val[0]); @@ -4135,6 +4138,11 @@ static DRFLAC_INLINE void drflac__vst2q_s16(drflac_int16* p, int16x4x2_t x) vst1q_s16(p, vcombine_s16(x.val[0], x.val[1])); } +static DRFLAC_INLINE void drflac__vst2q_u16(drflac_uint16* p, uint16x4x2_t x) +{ + vst1q_u16(p, vcombine_u16(x.val[0], x.val[1])); +} + static DRFLAC_INLINE int32x4_t drflac__vdupq_n_s32x4(drflac_int32 x3, drflac_int32 x2, drflac_int32 x1, drflac_int32 x0) { drflac_int32 x[4]; @@ -5879,6 +5887,14 @@ static drflac_bool32 drflac__seek_to_pcm_frame__seek_table(drflac* pFlac, drflac iClosestSeekpoint = iSeekpoint; } + /* There's been cases where the seek table contains only zeros. We need to do some basic validation on the closest seekpoint. */ + if (pFlac->pSeekpoints[iClosestSeekpoint].pcmFrameCount == 0 || pFlac->pSeekpoints[iClosestSeekpoint].pcmFrameCount > pFlac->maxBlockSizeInPCMFrames) { + return DRFLAC_FALSE; + } + if (pFlac->pSeekpoints[iClosestSeekpoint].firstPCMFrame > pFlac->totalPCMFrameCount && pFlac->totalPCMFrameCount > 0) { + return DRFLAC_FALSE; + } + #if !defined(DR_FLAC_NO_CRC) /* At this point we should know the closest seek point. We can use a binary search for this. We need to know the total sample count for this. 
*/ if (pFlac->totalPCMFrameCount > 0) { @@ -5888,9 +5904,23 @@ static drflac_bool32 drflac__seek_to_pcm_frame__seek_table(drflac* pFlac, drflac byteRangeHi = pFlac->firstFLACFramePosInBytes + (drflac_uint64)((drflac_int64)(pFlac->totalPCMFrameCount * pFlac->channels * pFlac->bitsPerSample)/8.0f); byteRangeLo = pFlac->firstFLACFramePosInBytes + pFlac->pSeekpoints[iClosestSeekpoint].flacFrameOffset; + /* + If our closest seek point is not the last one, we only need to search between it and the next one. The section below calculates an appropriate starting + value for byteRangeHi which will clamp it appropriately. + + Note that the next seekpoint must have an offset greater than the closest seekpoint because otherwise our binary search algorithm will break down. There + have been cases where a seektable consists of seek points where every byte offset is set to 0 which causes problems. If this happens we need to abort. + */ if (iClosestSeekpoint < pFlac->seekpointCount-1) { - if (pFlac->pSeekpoints[iClosestSeekpoint+1].firstPCMFrame != (((drflac_uint64)0xFFFFFFFF << 32) | 0xFFFFFFFF)) { /* Is it a placeholder seekpoint. */ - byteRangeHi = pFlac->firstFLACFramePosInBytes + pFlac->pSeekpoints[iClosestSeekpoint+1].flacFrameOffset-1; /* Must be zero based. */ + drflac_uint32 iNextSeekpoint = iClosestSeekpoint + 1; + + /* Basic validation on the seekpoints to ensure they're usable. */ + if (pFlac->pSeekpoints[iClosestSeekpoint].flacFrameOffset >= pFlac->pSeekpoints[iNextSeekpoint].flacFrameOffset || pFlac->pSeekpoints[iNextSeekpoint].pcmFrameCount == 0) { + return DRFLAC_FALSE; /* The next seekpoint doesn't look right. The seek table cannot be trusted from here. Abort. */ + } + + if (pFlac->pSeekpoints[iNextSeekpoint].firstPCMFrame != (((drflac_uint64)0xFFFFFFFF << 32) | 0xFFFFFFFF)) { /* Make sure it's not a placeholder seekpoint. */ + byteRangeHi = pFlac->firstFLACFramePosInBytes + pFlac->pSeekpoints[iNextSeekpoint].flacFrameOffset - 1; /* byteRangeHi must be zero based. 
*/ } } @@ -8585,78 +8615,77 @@ DRFLAC_API void drflac_close(drflac* pFlac) #if 0 -static DRFLAC_INLINE void drflac_read_pcm_frames_s32__decode_left_side__reference(drflac* pFlac, drflac_uint64 frameCount, drflac_int32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, drflac_int32* pOutputSamples) +static DRFLAC_INLINE void drflac_read_pcm_frames_s32__decode_left_side__reference(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, drflac_int32* pOutputSamples) { drflac_uint64 i; for (i = 0; i < frameCount; ++i) { - drflac_int32 left = pInputSamples0[i] << (unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample); - drflac_int32 side = pInputSamples1[i] << (unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample); - drflac_int32 right = left - side; + drflac_uint32 left = (drflac_uint32)pInputSamples0[i] << (unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample); + drflac_uint32 side = (drflac_uint32)pInputSamples1[i] << (unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample); + drflac_uint32 right = left - side; - pOutputSamples[i*2+0] = left; - pOutputSamples[i*2+1] = right; + pOutputSamples[i*2+0] = (drflac_int32)left; + pOutputSamples[i*2+1] = (drflac_int32)right; } } #endif -static DRFLAC_INLINE void drflac_read_pcm_frames_s32__decode_left_side__scalar(drflac* pFlac, drflac_uint64 frameCount, drflac_int32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, drflac_int32* pOutputSamples) +static DRFLAC_INLINE void drflac_read_pcm_frames_s32__decode_left_side__scalar(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, drflac_int32* pOutputSamples) { drflac_uint64 i; drflac_uint64 frameCount4 = 
frameCount >> 2; + const drflac_uint32* pInputSamples0U32 = (const drflac_uint32*)pInputSamples0; + const drflac_uint32* pInputSamples1U32 = (const drflac_uint32*)pInputSamples1; + drflac_uint32 shift0 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample; + drflac_uint32 shift1 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample; - drflac_int32 shift0 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample; - drflac_int32 shift1 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample; for (i = 0; i < frameCount4; ++i) { - drflac_int32 left0 = pInputSamples0[i*4+0] << shift0; - drflac_int32 left1 = pInputSamples0[i*4+1] << shift0; - drflac_int32 left2 = pInputSamples0[i*4+2] << shift0; - drflac_int32 left3 = pInputSamples0[i*4+3] << shift0; + drflac_uint32 left0 = pInputSamples0U32[i*4+0] << shift0; + drflac_uint32 left1 = pInputSamples0U32[i*4+1] << shift0; + drflac_uint32 left2 = pInputSamples0U32[i*4+2] << shift0; + drflac_uint32 left3 = pInputSamples0U32[i*4+3] << shift0; - drflac_int32 side0 = pInputSamples1[i*4+0] << shift1; - drflac_int32 side1 = pInputSamples1[i*4+1] << shift1; - drflac_int32 side2 = pInputSamples1[i*4+2] << shift1; - drflac_int32 side3 = pInputSamples1[i*4+3] << shift1; + drflac_uint32 side0 = pInputSamples1U32[i*4+0] << shift1; + drflac_uint32 side1 = pInputSamples1U32[i*4+1] << shift1; + drflac_uint32 side2 = pInputSamples1U32[i*4+2] << shift1; + drflac_uint32 side3 = pInputSamples1U32[i*4+3] << shift1; - drflac_int32 right0 = left0 - side0; - drflac_int32 right1 = left1 - side1; - drflac_int32 right2 = left2 - side2; - drflac_int32 right3 = left3 - side3; + drflac_uint32 right0 = left0 - side0; + drflac_uint32 right1 = left1 - side1; + drflac_uint32 right2 = left2 - side2; + drflac_uint32 right3 = left3 - side3; - pOutputSamples[i*8+0] = left0; - pOutputSamples[i*8+1] = right0; - pOutputSamples[i*8+2] = left1; - 
pOutputSamples[i*8+3] = right1; - pOutputSamples[i*8+4] = left2; - pOutputSamples[i*8+5] = right2; - pOutputSamples[i*8+6] = left3; - pOutputSamples[i*8+7] = right3; + pOutputSamples[i*8+0] = (drflac_int32)left0; + pOutputSamples[i*8+1] = (drflac_int32)right0; + pOutputSamples[i*8+2] = (drflac_int32)left1; + pOutputSamples[i*8+3] = (drflac_int32)right1; + pOutputSamples[i*8+4] = (drflac_int32)left2; + pOutputSamples[i*8+5] = (drflac_int32)right2; + pOutputSamples[i*8+6] = (drflac_int32)left3; + pOutputSamples[i*8+7] = (drflac_int32)right3; } for (i = (frameCount4 << 2); i < frameCount; ++i) { - int left = pInputSamples0[i] << shift0; - int side = pInputSamples1[i] << shift1; - int right = left - side; + drflac_uint32 left = pInputSamples0U32[i] << shift0; + drflac_uint32 side = pInputSamples1U32[i] << shift1; + drflac_uint32 right = left - side; - pOutputSamples[i*2+0] = left; - pOutputSamples[i*2+1] = right; + pOutputSamples[i*2+0] = (drflac_int32)left; + pOutputSamples[i*2+1] = (drflac_int32)right; } } #if defined(DRFLAC_SUPPORT_SSE2) -static DRFLAC_INLINE void drflac_read_pcm_frames_s32__decode_left_side__sse2(drflac* pFlac, drflac_uint64 frameCount, drflac_int32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, drflac_int32* pOutputSamples) +static DRFLAC_INLINE void drflac_read_pcm_frames_s32__decode_left_side__sse2(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, drflac_int32* pOutputSamples) { - drflac_uint64 frameCount4; - drflac_int32 shift0; - drflac_int32 shift1; drflac_uint64 i; + drflac_uint64 frameCount4 = frameCount >> 2; + const drflac_uint32* pInputSamples0U32 = (const drflac_uint32*)pInputSamples0; + const drflac_uint32* pInputSamples1U32 = (const drflac_uint32*)pInputSamples1; + drflac_uint32 shift0 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample; + drflac_uint32 shift1 = 
unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample; DRFLAC_ASSERT(pFlac->bitsPerSample <= 24); - frameCount4 = frameCount >> 2; - - shift0 = (unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample); - shift1 = (unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample); - for (i = 0; i < frameCount4; ++i) { __m128i left = _mm_slli_epi32(_mm_loadu_si128((const __m128i*)pInputSamples0 + i), shift0); __m128i side = _mm_slli_epi32(_mm_loadu_si128((const __m128i*)pInputSamples1 + i), shift1); @@ -8667,60 +8696,57 @@ static DRFLAC_INLINE void drflac_read_pcm_frames_s32__decode_left_side__sse2(drf } for (i = (frameCount4 << 2); i < frameCount; ++i) { - drflac_int32 left = pInputSamples0[i] << shift0; - drflac_int32 side = pInputSamples1[i] << shift1; - drflac_int32 right = left - side; + drflac_uint32 left = pInputSamples0U32[i] << shift0; + drflac_uint32 side = pInputSamples1U32[i] << shift1; + drflac_uint32 right = left - side; - pOutputSamples[i*2+0] = left; - pOutputSamples[i*2+1] = right; + pOutputSamples[i*2+0] = (drflac_int32)left; + pOutputSamples[i*2+1] = (drflac_int32)right; } } #endif #if defined(DRFLAC_SUPPORT_NEON) -static DRFLAC_INLINE void drflac_read_pcm_frames_s32__decode_left_side__neon(drflac* pFlac, drflac_uint64 frameCount, drflac_int32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, drflac_int32* pOutputSamples) +static DRFLAC_INLINE void drflac_read_pcm_frames_s32__decode_left_side__neon(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, drflac_int32* pOutputSamples) { - drflac_uint64 frameCount4; - drflac_int32 shift0; - drflac_int32 shift1; drflac_uint64 i; + drflac_uint64 frameCount4 = frameCount >> 2; + const drflac_uint32* pInputSamples0U32 = (const drflac_uint32*)pInputSamples0; + const drflac_uint32* pInputSamples1U32 = (const 
drflac_uint32*)pInputSamples1; + drflac_uint32 shift0 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample; + drflac_uint32 shift1 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample; int32x4_t shift0_4; int32x4_t shift1_4; DRFLAC_ASSERT(pFlac->bitsPerSample <= 24); - frameCount4 = frameCount >> 2; - - shift0 = (unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample); - shift1 = (unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample); - shift0_4 = vdupq_n_s32(shift0); shift1_4 = vdupq_n_s32(shift1); for (i = 0; i < frameCount4; ++i) { - int32x4_t left; - int32x4_t side; - int32x4_t right; + uint32x4_t left; + uint32x4_t side; + uint32x4_t right; - left = vshlq_s32(vld1q_s32(pInputSamples0 + i*4), shift0_4); - side = vshlq_s32(vld1q_s32(pInputSamples1 + i*4), shift1_4); - right = vsubq_s32(left, side); + left = vshlq_u32(vld1q_u32(pInputSamples0U32 + i*4), shift0_4); + side = vshlq_u32(vld1q_u32(pInputSamples1U32 + i*4), shift1_4); + right = vsubq_u32(left, side); - drflac__vst2q_s32(pOutputSamples + i*8, vzipq_s32(left, right)); + drflac__vst2q_u32((drflac_uint32*)pOutputSamples + i*8, vzipq_u32(left, right)); } for (i = (frameCount4 << 2); i < frameCount; ++i) { - drflac_int32 left = pInputSamples0[i] << shift0; - drflac_int32 side = pInputSamples1[i] << shift1; - drflac_int32 right = left - side; + drflac_uint32 left = pInputSamples0U32[i] << shift0; + drflac_uint32 side = pInputSamples1U32[i] << shift1; + drflac_uint32 right = left - side; - pOutputSamples[i*2+0] = left; - pOutputSamples[i*2+1] = right; + pOutputSamples[i*2+0] = (drflac_int32)left; + pOutputSamples[i*2+1] = (drflac_int32)right; } } #endif -static DRFLAC_INLINE void drflac_read_pcm_frames_s32__decode_left_side(drflac* pFlac, drflac_uint64 frameCount, drflac_int32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, drflac_int32* pOutputSamples) +static 
DRFLAC_INLINE void drflac_read_pcm_frames_s32__decode_left_side(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, drflac_int32* pOutputSamples) { #if defined(DRFLAC_SUPPORT_SSE2) if (drflac__gIsSSE2Supported && pFlac->bitsPerSample <= 24) { @@ -8743,78 +8769,77 @@ static DRFLAC_INLINE void drflac_read_pcm_frames_s32__decode_left_side(drflac* p #if 0 -static DRFLAC_INLINE void drflac_read_pcm_frames_s32__decode_right_side__reference(drflac* pFlac, drflac_uint64 frameCount, drflac_int32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, drflac_int32* pOutputSamples) +static DRFLAC_INLINE void drflac_read_pcm_frames_s32__decode_right_side__reference(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, drflac_int32* pOutputSamples) { drflac_uint64 i; for (i = 0; i < frameCount; ++i) { - drflac_int32 side = pInputSamples0[i] << (unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample); - drflac_int32 right = pInputSamples1[i] << (unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample); - drflac_int32 left = right + side; + drflac_uint32 side = (drflac_uint32)pInputSamples0[i] << (unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample); + drflac_uint32 right = (drflac_uint32)pInputSamples1[i] << (unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample); + drflac_uint32 left = right + side; - pOutputSamples[i*2+0] = left; - pOutputSamples[i*2+1] = right; + pOutputSamples[i*2+0] = (drflac_int32)left; + pOutputSamples[i*2+1] = (drflac_int32)right; } } #endif -static DRFLAC_INLINE void drflac_read_pcm_frames_s32__decode_right_side__scalar(drflac* pFlac, drflac_uint64 frameCount, drflac_int32 unusedBitsPerSample, const drflac_int32* pInputSamples0, 
const drflac_int32* pInputSamples1, drflac_int32* pOutputSamples) +static DRFLAC_INLINE void drflac_read_pcm_frames_s32__decode_right_side__scalar(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, drflac_int32* pOutputSamples) { drflac_uint64 i; drflac_uint64 frameCount4 = frameCount >> 2; + const drflac_uint32* pInputSamples0U32 = (const drflac_uint32*)pInputSamples0; + const drflac_uint32* pInputSamples1U32 = (const drflac_uint32*)pInputSamples1; + drflac_uint32 shift0 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample; + drflac_uint32 shift1 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample; - drflac_int32 shift0 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample; - drflac_int32 shift1 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample; for (i = 0; i < frameCount4; ++i) { - drflac_int32 side0 = pInputSamples0[i*4+0] << shift0; - drflac_int32 side1 = pInputSamples0[i*4+1] << shift0; - drflac_int32 side2 = pInputSamples0[i*4+2] << shift0; - drflac_int32 side3 = pInputSamples0[i*4+3] << shift0; + drflac_uint32 side0 = pInputSamples0U32[i*4+0] << shift0; + drflac_uint32 side1 = pInputSamples0U32[i*4+1] << shift0; + drflac_uint32 side2 = pInputSamples0U32[i*4+2] << shift0; + drflac_uint32 side3 = pInputSamples0U32[i*4+3] << shift0; - drflac_int32 right0 = pInputSamples1[i*4+0] << shift1; - drflac_int32 right1 = pInputSamples1[i*4+1] << shift1; - drflac_int32 right2 = pInputSamples1[i*4+2] << shift1; - drflac_int32 right3 = pInputSamples1[i*4+3] << shift1; + drflac_uint32 right0 = pInputSamples1U32[i*4+0] << shift1; + drflac_uint32 right1 = pInputSamples1U32[i*4+1] << shift1; + drflac_uint32 right2 = pInputSamples1U32[i*4+2] << shift1; + drflac_uint32 right3 = pInputSamples1U32[i*4+3] << shift1; - drflac_int32 left0 = right0 + side0; - drflac_int32 
left1 = right1 + side1; - drflac_int32 left2 = right2 + side2; - drflac_int32 left3 = right3 + side3; + drflac_uint32 left0 = right0 + side0; + drflac_uint32 left1 = right1 + side1; + drflac_uint32 left2 = right2 + side2; + drflac_uint32 left3 = right3 + side3; - pOutputSamples[i*8+0] = left0; - pOutputSamples[i*8+1] = right0; - pOutputSamples[i*8+2] = left1; - pOutputSamples[i*8+3] = right1; - pOutputSamples[i*8+4] = left2; - pOutputSamples[i*8+5] = right2; - pOutputSamples[i*8+6] = left3; - pOutputSamples[i*8+7] = right3; + pOutputSamples[i*8+0] = (drflac_int32)left0; + pOutputSamples[i*8+1] = (drflac_int32)right0; + pOutputSamples[i*8+2] = (drflac_int32)left1; + pOutputSamples[i*8+3] = (drflac_int32)right1; + pOutputSamples[i*8+4] = (drflac_int32)left2; + pOutputSamples[i*8+5] = (drflac_int32)right2; + pOutputSamples[i*8+6] = (drflac_int32)left3; + pOutputSamples[i*8+7] = (drflac_int32)right3; } for (i = (frameCount4 << 2); i < frameCount; ++i) { - drflac_int32 side = pInputSamples0[i] << shift0; - drflac_int32 right = pInputSamples1[i] << shift1; - drflac_int32 left = right + side; + drflac_uint32 side = pInputSamples0U32[i] << shift0; + drflac_uint32 right = pInputSamples1U32[i] << shift1; + drflac_uint32 left = right + side; - pOutputSamples[i*2+0] = left; - pOutputSamples[i*2+1] = right; + pOutputSamples[i*2+0] = (drflac_int32)left; + pOutputSamples[i*2+1] = (drflac_int32)right; } } #if defined(DRFLAC_SUPPORT_SSE2) -static DRFLAC_INLINE void drflac_read_pcm_frames_s32__decode_right_side__sse2(drflac* pFlac, drflac_uint64 frameCount, drflac_int32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, drflac_int32* pOutputSamples) +static DRFLAC_INLINE void drflac_read_pcm_frames_s32__decode_right_side__sse2(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, drflac_int32* pOutputSamples) { - drflac_uint64 frameCount4; - 
drflac_int32 shift0; - drflac_int32 shift1; drflac_uint64 i; + drflac_uint64 frameCount4 = frameCount >> 2; + const drflac_uint32* pInputSamples0U32 = (const drflac_uint32*)pInputSamples0; + const drflac_uint32* pInputSamples1U32 = (const drflac_uint32*)pInputSamples1; + drflac_uint32 shift0 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample; + drflac_uint32 shift1 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample; DRFLAC_ASSERT(pFlac->bitsPerSample <= 24); - frameCount4 = frameCount >> 2; - - shift0 = (unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample); - shift1 = (unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample); - for (i = 0; i < frameCount4; ++i) { __m128i side = _mm_slli_epi32(_mm_loadu_si128((const __m128i*)pInputSamples0 + i), shift0); __m128i right = _mm_slli_epi32(_mm_loadu_si128((const __m128i*)pInputSamples1 + i), shift1); @@ -8825,60 +8850,57 @@ static DRFLAC_INLINE void drflac_read_pcm_frames_s32__decode_right_side__sse2(dr } for (i = (frameCount4 << 2); i < frameCount; ++i) { - drflac_int32 side = pInputSamples0[i] << shift0; - drflac_int32 right = pInputSamples1[i] << shift1; - drflac_int32 left = right + side; + drflac_uint32 side = pInputSamples0U32[i] << shift0; + drflac_uint32 right = pInputSamples1U32[i] << shift1; + drflac_uint32 left = right + side; - pOutputSamples[i*2+0] = left; - pOutputSamples[i*2+1] = right; + pOutputSamples[i*2+0] = (drflac_int32)left; + pOutputSamples[i*2+1] = (drflac_int32)right; } } #endif #if defined(DRFLAC_SUPPORT_NEON) -static DRFLAC_INLINE void drflac_read_pcm_frames_s32__decode_right_side__neon(drflac* pFlac, drflac_uint64 frameCount, drflac_int32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, drflac_int32* pOutputSamples) +static DRFLAC_INLINE void drflac_read_pcm_frames_s32__decode_right_side__neon(drflac* pFlac, drflac_uint64 frameCount, 
drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, drflac_int32* pOutputSamples) { - drflac_uint64 frameCount4; - drflac_int32 shift0; - drflac_int32 shift1; drflac_uint64 i; + drflac_uint64 frameCount4 = frameCount >> 2; + const drflac_uint32* pInputSamples0U32 = (const drflac_uint32*)pInputSamples0; + const drflac_uint32* pInputSamples1U32 = (const drflac_uint32*)pInputSamples1; + drflac_uint32 shift0 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample; + drflac_uint32 shift1 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample; int32x4_t shift0_4; int32x4_t shift1_4; DRFLAC_ASSERT(pFlac->bitsPerSample <= 24); - frameCount4 = frameCount >> 2; - - shift0 = (unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample); - shift1 = (unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample); - shift0_4 = vdupq_n_s32(shift0); shift1_4 = vdupq_n_s32(shift1); for (i = 0; i < frameCount4; ++i) { - int32x4_t side; - int32x4_t right; - int32x4_t left; + uint32x4_t side; + uint32x4_t right; + uint32x4_t left; - side = vshlq_s32(vld1q_s32(pInputSamples0 + i*4), shift0_4); - right = vshlq_s32(vld1q_s32(pInputSamples1 + i*4), shift1_4); - left = vaddq_s32(right, side); + side = vshlq_u32(vld1q_u32(pInputSamples0U32 + i*4), shift0_4); + right = vshlq_u32(vld1q_u32(pInputSamples1U32 + i*4), shift1_4); + left = vaddq_u32(right, side); - drflac__vst2q_s32(pOutputSamples + i*8, vzipq_s32(left, right)); + drflac__vst2q_u32((drflac_uint32*)pOutputSamples + i*8, vzipq_u32(left, right)); } for (i = (frameCount4 << 2); i < frameCount; ++i) { - drflac_int32 side = pInputSamples0[i] << shift0; - drflac_int32 right = pInputSamples1[i] << shift1; - drflac_int32 left = right + side; + drflac_uint32 side = pInputSamples0U32[i] << shift0; + drflac_uint32 right = pInputSamples1U32[i] << shift1; + drflac_uint32 left = right + side; - 
pOutputSamples[i*2+0] = left; - pOutputSamples[i*2+1] = right; + pOutputSamples[i*2+0] = (drflac_int32)left; + pOutputSamples[i*2+1] = (drflac_int32)right; } } #endif -static DRFLAC_INLINE void drflac_read_pcm_frames_s32__decode_right_side(drflac* pFlac, drflac_uint64 frameCount, drflac_int32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, drflac_int32* pOutputSamples) +static DRFLAC_INLINE void drflac_read_pcm_frames_s32__decode_right_side(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, drflac_int32* pOutputSamples) { #if defined(DRFLAC_SUPPORT_SSE2) if (drflac__gIsSSE2Supported && pFlac->bitsPerSample <= 24) { @@ -8901,142 +8923,143 @@ static DRFLAC_INLINE void drflac_read_pcm_frames_s32__decode_right_side(drflac* #if 0 -static DRFLAC_INLINE void drflac_read_pcm_frames_s32__decode_mid_side__reference(drflac* pFlac, drflac_uint64 frameCount, drflac_int32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, drflac_int32* pOutputSamples) +static DRFLAC_INLINE void drflac_read_pcm_frames_s32__decode_mid_side__reference(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, drflac_int32* pOutputSamples) { for (drflac_uint64 i = 0; i < frameCount; ++i) { - int mid = pInputSamples0[i] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample; - int side = pInputSamples1[i] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample; + drflac_uint32 mid = (drflac_uint32)pInputSamples0[i] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample; /* Cast each sample to unsigned before shifting; this variant has no unsigned alias pointers. */ + drflac_uint32 side = (drflac_uint32)pInputSamples1[i] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample; - mid = (((drflac_uint32)mid) << 1) | (side & 0x01); + mid = (mid << 1) | (side & 0x01); - pOutputSamples[i*2+0] = ((mid + side) >> 1) << unusedBitsPerSample; - 
pOutputSamples[i*2+1] = ((mid - side) >> 1) << unusedBitsPerSample; + pOutputSamples[i*2+0] = (drflac_int32)((drflac_uint32)((drflac_int32)(mid + side) >> 1) << unusedBitsPerSample); + pOutputSamples[i*2+1] = (drflac_int32)((drflac_uint32)((drflac_int32)(mid - side) >> 1) << unusedBitsPerSample); } } #endif -static DRFLAC_INLINE void drflac_read_pcm_frames_s32__decode_mid_side__scalar(drflac* pFlac, drflac_uint64 frameCount, drflac_int32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, drflac_int32* pOutputSamples) +static DRFLAC_INLINE void drflac_read_pcm_frames_s32__decode_mid_side__scalar(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, drflac_int32* pOutputSamples) { drflac_uint64 i; drflac_uint64 frameCount4 = frameCount >> 2; - + const drflac_uint32* pInputSamples0U32 = (const drflac_uint32*)pInputSamples0; + const drflac_uint32* pInputSamples1U32 = (const drflac_uint32*)pInputSamples1; drflac_int32 shift = unusedBitsPerSample; + if (shift > 0) { shift -= 1; for (i = 0; i < frameCount4; ++i) { - drflac_int32 temp0L; - drflac_int32 temp1L; - drflac_int32 temp2L; - drflac_int32 temp3L; - drflac_int32 temp0R; - drflac_int32 temp1R; - drflac_int32 temp2R; - drflac_int32 temp3R; + drflac_uint32 temp0L; + drflac_uint32 temp1L; + drflac_uint32 temp2L; + drflac_uint32 temp3L; + drflac_uint32 temp0R; + drflac_uint32 temp1R; + drflac_uint32 temp2R; + drflac_uint32 temp3R; - drflac_int32 mid0 = pInputSamples0[i*4+0] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample; - drflac_int32 mid1 = pInputSamples0[i*4+1] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample; - drflac_int32 mid2 = pInputSamples0[i*4+2] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample; - drflac_int32 mid3 = pInputSamples0[i*4+3] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample; + drflac_uint32 mid0 = 
pInputSamples0U32[i*4+0] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample; + drflac_uint32 mid1 = pInputSamples0U32[i*4+1] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample; + drflac_uint32 mid2 = pInputSamples0U32[i*4+2] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample; + drflac_uint32 mid3 = pInputSamples0U32[i*4+3] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample; - drflac_int32 side0 = pInputSamples1[i*4+0] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample; - drflac_int32 side1 = pInputSamples1[i*4+1] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample; - drflac_int32 side2 = pInputSamples1[i*4+2] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample; - drflac_int32 side3 = pInputSamples1[i*4+3] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample; + drflac_uint32 side0 = pInputSamples1U32[i*4+0] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample; + drflac_uint32 side1 = pInputSamples1U32[i*4+1] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample; + drflac_uint32 side2 = pInputSamples1U32[i*4+2] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample; + drflac_uint32 side3 = pInputSamples1U32[i*4+3] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample; - mid0 = (((drflac_uint32)mid0) << 1) | (side0 & 0x01); - mid1 = (((drflac_uint32)mid1) << 1) | (side1 & 0x01); - mid2 = (((drflac_uint32)mid2) << 1) | (side2 & 0x01); - mid3 = (((drflac_uint32)mid3) << 1) | (side3 & 0x01); + mid0 = (mid0 << 1) | (side0 & 0x01); + mid1 = (mid1 << 1) | (side1 & 0x01); + mid2 = (mid2 << 1) | (side2 & 0x01); + mid3 = (mid3 << 1) | (side3 & 0x01); - temp0L = ((mid0 + side0) << shift); - temp1L = ((mid1 + side1) << shift); - temp2L = ((mid2 + side2) << shift); - temp3L = ((mid3 + side3) << shift); + temp0L = (mid0 + side0) << shift; + temp1L = (mid1 + side1) << shift; + temp2L = (mid2 + side2) << shift; + temp3L = (mid3 + side3) << shift; - temp0R = ((mid0 - side0) << shift); - temp1R 
= ((mid1 - side1) << shift); - temp2R = ((mid2 - side2) << shift); - temp3R = ((mid3 - side3) << shift); + temp0R = (mid0 - side0) << shift; + temp1R = (mid1 - side1) << shift; + temp2R = (mid2 - side2) << shift; + temp3R = (mid3 - side3) << shift; - pOutputSamples[i*8+0] = temp0L; - pOutputSamples[i*8+1] = temp0R; - pOutputSamples[i*8+2] = temp1L; - pOutputSamples[i*8+3] = temp1R; - pOutputSamples[i*8+4] = temp2L; - pOutputSamples[i*8+5] = temp2R; - pOutputSamples[i*8+6] = temp3L; - pOutputSamples[i*8+7] = temp3R; + pOutputSamples[i*8+0] = (drflac_int32)temp0L; + pOutputSamples[i*8+1] = (drflac_int32)temp0R; + pOutputSamples[i*8+2] = (drflac_int32)temp1L; + pOutputSamples[i*8+3] = (drflac_int32)temp1R; + pOutputSamples[i*8+4] = (drflac_int32)temp2L; + pOutputSamples[i*8+5] = (drflac_int32)temp2R; + pOutputSamples[i*8+6] = (drflac_int32)temp3L; + pOutputSamples[i*8+7] = (drflac_int32)temp3R; } } else { for (i = 0; i < frameCount4; ++i) { - drflac_int32 temp0L; - drflac_int32 temp1L; - drflac_int32 temp2L; - drflac_int32 temp3L; - drflac_int32 temp0R; - drflac_int32 temp1R; - drflac_int32 temp2R; - drflac_int32 temp3R; + drflac_uint32 temp0L; + drflac_uint32 temp1L; + drflac_uint32 temp2L; + drflac_uint32 temp3L; + drflac_uint32 temp0R; + drflac_uint32 temp1R; + drflac_uint32 temp2R; + drflac_uint32 temp3R; - drflac_int32 mid0 = pInputSamples0[i*4+0] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample; - drflac_int32 mid1 = pInputSamples0[i*4+1] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample; - drflac_int32 mid2 = pInputSamples0[i*4+2] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample; - drflac_int32 mid3 = pInputSamples0[i*4+3] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample; + drflac_uint32 mid0 = pInputSamples0U32[i*4+0] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample; + drflac_uint32 mid1 = pInputSamples0U32[i*4+1] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample; + drflac_uint32 mid2 = 
pInputSamples0U32[i*4+2] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample; + drflac_uint32 mid3 = pInputSamples0U32[i*4+3] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample; - drflac_int32 side0 = pInputSamples1[i*4+0] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample; - drflac_int32 side1 = pInputSamples1[i*4+1] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample; - drflac_int32 side2 = pInputSamples1[i*4+2] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample; - drflac_int32 side3 = pInputSamples1[i*4+3] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample; + drflac_uint32 side0 = pInputSamples1U32[i*4+0] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample; + drflac_uint32 side1 = pInputSamples1U32[i*4+1] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample; + drflac_uint32 side2 = pInputSamples1U32[i*4+2] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample; + drflac_uint32 side3 = pInputSamples1U32[i*4+3] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample; - mid0 = (((drflac_uint32)mid0) << 1) | (side0 & 0x01); - mid1 = (((drflac_uint32)mid1) << 1) | (side1 & 0x01); - mid2 = (((drflac_uint32)mid2) << 1) | (side2 & 0x01); - mid3 = (((drflac_uint32)mid3) << 1) | (side3 & 0x01); + mid0 = (mid0 << 1) | (side0 & 0x01); + mid1 = (mid1 << 1) | (side1 & 0x01); + mid2 = (mid2 << 1) | (side2 & 0x01); + mid3 = (mid3 << 1) | (side3 & 0x01); - temp0L = ((mid0 + side0) >> 1); - temp1L = ((mid1 + side1) >> 1); - temp2L = ((mid2 + side2) >> 1); - temp3L = ((mid3 + side3) >> 1); + temp0L = (drflac_uint32)((drflac_int32)(mid0 + side0) >> 1); + temp1L = (drflac_uint32)((drflac_int32)(mid1 + side1) >> 1); + temp2L = (drflac_uint32)((drflac_int32)(mid2 + side2) >> 1); + temp3L = (drflac_uint32)((drflac_int32)(mid3 + side3) >> 1); - temp0R = ((mid0 - side0) >> 1); - temp1R = ((mid1 - side1) >> 1); - temp2R = ((mid2 - side2) >> 1); - temp3R = ((mid3 - side3) >> 1); + temp0R = 
(drflac_uint32)((drflac_int32)(mid0 - side0) >> 1); + temp1R = (drflac_uint32)((drflac_int32)(mid1 - side1) >> 1); + temp2R = (drflac_uint32)((drflac_int32)(mid2 - side2) >> 1); + temp3R = (drflac_uint32)((drflac_int32)(mid3 - side3) >> 1); - pOutputSamples[i*8+0] = temp0L; - pOutputSamples[i*8+1] = temp0R; - pOutputSamples[i*8+2] = temp1L; - pOutputSamples[i*8+3] = temp1R; - pOutputSamples[i*8+4] = temp2L; - pOutputSamples[i*8+5] = temp2R; - pOutputSamples[i*8+6] = temp3L; - pOutputSamples[i*8+7] = temp3R; + pOutputSamples[i*8+0] = (drflac_int32)temp0L; + pOutputSamples[i*8+1] = (drflac_int32)temp0R; + pOutputSamples[i*8+2] = (drflac_int32)temp1L; + pOutputSamples[i*8+3] = (drflac_int32)temp1R; + pOutputSamples[i*8+4] = (drflac_int32)temp2L; + pOutputSamples[i*8+5] = (drflac_int32)temp2R; + pOutputSamples[i*8+6] = (drflac_int32)temp3L; + pOutputSamples[i*8+7] = (drflac_int32)temp3R; } } for (i = (frameCount4 << 2); i < frameCount; ++i) { - drflac_int32 mid = pInputSamples0[i] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample; - drflac_int32 side = pInputSamples1[i] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample; + drflac_uint32 mid = pInputSamples0U32[i] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample; + drflac_uint32 side = pInputSamples1U32[i] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample; - mid = (((drflac_uint32)mid) << 1) | (side & 0x01); + mid = (mid << 1) | (side & 0x01); - pOutputSamples[i*2+0] = ((mid + side) >> 1) << unusedBitsPerSample; - pOutputSamples[i*2+1] = ((mid - side) >> 1) << unusedBitsPerSample; + pOutputSamples[i*2+0] = (drflac_int32)((drflac_uint32)((drflac_int32)(mid + side) >> 1) << unusedBitsPerSample); + pOutputSamples[i*2+1] = (drflac_int32)((drflac_uint32)((drflac_int32)(mid - side) >> 1) << unusedBitsPerSample); } } #if defined(DRFLAC_SUPPORT_SSE2) -static DRFLAC_INLINE void drflac_read_pcm_frames_s32__decode_mid_side__sse2(drflac* pFlac, drflac_uint64 frameCount, drflac_int32 
unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, drflac_int32* pOutputSamples) +static DRFLAC_INLINE void drflac_read_pcm_frames_s32__decode_mid_side__sse2(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, drflac_int32* pOutputSamples) { drflac_uint64 i; - drflac_uint64 frameCount4; - int shift; + drflac_uint64 frameCount4 = frameCount >> 2; + const drflac_uint32* pInputSamples0U32 = (const drflac_uint32*)pInputSamples0; + const drflac_uint32* pInputSamples1U32 = (const drflac_uint32*)pInputSamples1; + drflac_int32 shift = unusedBitsPerSample; DRFLAC_ASSERT(pFlac->bitsPerSample <= 24); - frameCount4 = frameCount >> 2; - - shift = unusedBitsPerSample; if (shift == 0) { for (i = 0; i < frameCount4; ++i) { __m128i mid; @@ -9057,13 +9080,13 @@ static DRFLAC_INLINE void drflac_read_pcm_frames_s32__decode_mid_side__sse2(drfl } for (i = (frameCount4 << 2); i < frameCount; ++i) { - drflac_int32 mid = pInputSamples0[i] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample; - drflac_int32 side = pInputSamples1[i] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample; + drflac_uint32 mid = pInputSamples0U32[i] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample; + drflac_uint32 side = pInputSamples1U32[i] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample; - mid = (((drflac_uint32)mid) << 1) | (side & 0x01); + mid = (mid << 1) | (side & 0x01); - pOutputSamples[i*2+0] = ((mid + side) >> 1); - pOutputSamples[i*2+1] = ((mid - side) >> 1); + pOutputSamples[i*2+0] = (drflac_int32)(mid + side) >> 1; + pOutputSamples[i*2+1] = (drflac_int32)(mid - side) >> 1; } } else { shift -= 1; @@ -9086,63 +9109,62 @@ static DRFLAC_INLINE void drflac_read_pcm_frames_s32__decode_mid_side__sse2(drfl } for (i = (frameCount4 << 2); i < frameCount; ++i) { - drflac_int32 mid = pInputSamples0[i] << 
pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample; - drflac_int32 side = pInputSamples1[i] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample; + drflac_uint32 mid = pInputSamples0U32[i] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample; + drflac_uint32 side = pInputSamples1U32[i] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample; - mid = (((drflac_uint32)mid) << 1) | (side & 0x01); + mid = (mid << 1) | (side & 0x01); - pOutputSamples[i*2+0] = ((mid + side) << shift); - pOutputSamples[i*2+1] = ((mid - side) << shift); + pOutputSamples[i*2+0] = (drflac_int32)((mid + side) << shift); + pOutputSamples[i*2+1] = (drflac_int32)((mid - side) << shift); } } } #endif #if defined(DRFLAC_SUPPORT_NEON) -static DRFLAC_INLINE void drflac_read_pcm_frames_s32__decode_mid_side__neon(drflac* pFlac, drflac_uint64 frameCount, drflac_int32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, drflac_int32* pOutputSamples) +static DRFLAC_INLINE void drflac_read_pcm_frames_s32__decode_mid_side__neon(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, drflac_int32* pOutputSamples) { drflac_uint64 i; - drflac_uint64 frameCount4; - int shift; - int32x4_t wbpsShift0_4; /* wbps = Wasted Bits Per Sample */ - int32x4_t wbpsShift1_4; /* wbps = Wasted Bits Per Sample */ - int32x4_t one4; + drflac_uint64 frameCount4 = frameCount >> 2; + const drflac_uint32* pInputSamples0U32 = (const drflac_uint32*)pInputSamples0; + const drflac_uint32* pInputSamples1U32 = (const drflac_uint32*)pInputSamples1; + drflac_int32 shift = unusedBitsPerSample; + int32x4_t wbpsShift0_4; /* wbps = Wasted Bits Per Sample */ + int32x4_t wbpsShift1_4; /* wbps = Wasted Bits Per Sample */ + uint32x4_t one4; DRFLAC_ASSERT(pFlac->bitsPerSample <= 24); - frameCount4 = frameCount >> 2; - wbpsShift0_4 = 
vdupq_n_s32(pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample); wbpsShift1_4 = vdupq_n_s32(pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample); - one4 = vdupq_n_s32(1); + one4 = vdupq_n_u32(1); - shift = unusedBitsPerSample; if (shift == 0) { for (i = 0; i < frameCount4; ++i) { - int32x4_t mid; - int32x4_t side; + uint32x4_t mid; + uint32x4_t side; int32x4_t left; int32x4_t right; - mid = vshlq_s32(vld1q_s32(pInputSamples0 + i*4), wbpsShift0_4); - side = vshlq_s32(vld1q_s32(pInputSamples1 + i*4), wbpsShift1_4); + mid = vshlq_u32(vld1q_u32(pInputSamples0U32 + i*4), wbpsShift0_4); + side = vshlq_u32(vld1q_u32(pInputSamples1U32 + i*4), wbpsShift1_4); - mid = vorrq_s32(vshlq_n_s32(mid, 1), vandq_s32(side, one4)); + mid = vorrq_u32(vshlq_n_u32(mid, 1), vandq_u32(side, one4)); - left = vshrq_n_s32(vaddq_s32(mid, side), 1); - right = vshrq_n_s32(vsubq_s32(mid, side), 1); + left = vshrq_n_s32(vreinterpretq_s32_u32(vaddq_u32(mid, side)), 1); + right = vshrq_n_s32(vreinterpretq_s32_u32(vsubq_u32(mid, side)), 1); drflac__vst2q_s32(pOutputSamples + i*8, vzipq_s32(left, right)); } for (i = (frameCount4 << 2); i < frameCount; ++i) { - drflac_int32 mid = pInputSamples0[i] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample; - drflac_int32 side = pInputSamples1[i] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample; + drflac_uint32 mid = pInputSamples0U32[i] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample; + drflac_uint32 side = pInputSamples1U32[i] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample; - mid = (((drflac_uint32)mid) << 1) | (side & 0x01); + mid = (mid << 1) | (side & 0x01); - pOutputSamples[i*2+0] = ((mid + side) >> 1); - pOutputSamples[i*2+1] = ((mid - side) >> 1); + pOutputSamples[i*2+0] = (drflac_int32)(mid + side) >> 1; + pOutputSamples[i*2+1] = (drflac_int32)(mid - side) >> 1; } } else { int32x4_t shift4; @@ -9151,36 +9173,36 @@ static DRFLAC_INLINE void drflac_read_pcm_frames_s32__decode_mid_side__neon(drfl 
shift4 = vdupq_n_s32(shift); for (i = 0; i < frameCount4; ++i) { - int32x4_t mid; - int32x4_t side; + uint32x4_t mid; + uint32x4_t side; int32x4_t left; int32x4_t right; - mid = vshlq_s32(vld1q_s32(pInputSamples0 + i*4), wbpsShift0_4); - side = vshlq_s32(vld1q_s32(pInputSamples1 + i*4), wbpsShift1_4); + mid = vshlq_u32(vld1q_u32(pInputSamples0U32 + i*4), wbpsShift0_4); + side = vshlq_u32(vld1q_u32(pInputSamples1U32 + i*4), wbpsShift1_4); - mid = vorrq_s32(vshlq_n_s32(mid, 1), vandq_s32(side, one4)); + mid = vorrq_u32(vshlq_n_u32(mid, 1), vandq_u32(side, one4)); - left = vshlq_s32(vaddq_s32(mid, side), shift4); - right = vshlq_s32(vsubq_s32(mid, side), shift4); + left = vreinterpretq_s32_u32(vshlq_u32(vaddq_u32(mid, side), shift4)); + right = vreinterpretq_s32_u32(vshlq_u32(vsubq_u32(mid, side), shift4)); drflac__vst2q_s32(pOutputSamples + i*8, vzipq_s32(left, right)); } for (i = (frameCount4 << 2); i < frameCount; ++i) { - drflac_int32 mid = pInputSamples0[i] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample; - drflac_int32 side = pInputSamples1[i] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample; + drflac_uint32 mid = pInputSamples0U32[i] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample; + drflac_uint32 side = pInputSamples1U32[i] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample; - mid = (((drflac_uint32)mid) << 1) | (side & 0x01); + mid = (mid << 1) | (side & 0x01); - pOutputSamples[i*2+0] = ((mid + side) << shift); - pOutputSamples[i*2+1] = ((mid - side) << shift); + pOutputSamples[i*2+0] = (drflac_int32)((mid + side) << shift); + pOutputSamples[i*2+1] = (drflac_int32)((mid - side) << shift); } } } #endif -static DRFLAC_INLINE void drflac_read_pcm_frames_s32__decode_mid_side(drflac* pFlac, drflac_uint64 frameCount, drflac_int32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, drflac_int32* pOutputSamples) +static DRFLAC_INLINE void 
drflac_read_pcm_frames_s32__decode_mid_side(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, drflac_int32* pOutputSamples) { #if defined(DRFLAC_SUPPORT_SSE2) if (drflac__gIsSSE2Supported && pFlac->bitsPerSample <= 24) { @@ -9203,58 +9225,60 @@ static DRFLAC_INLINE void drflac_read_pcm_frames_s32__decode_mid_side(drflac* pF #if 0 -static DRFLAC_INLINE void drflac_read_pcm_frames_s32__decode_independent_stereo__reference(drflac* pFlac, drflac_uint64 frameCount, drflac_int32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, drflac_int32* pOutputSamples) +static DRFLAC_INLINE void drflac_read_pcm_frames_s32__decode_independent_stereo__reference(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, drflac_int32* pOutputSamples) { for (drflac_uint64 i = 0; i < frameCount; ++i) { - pOutputSamples[i*2+0] = (pInputSamples0[i] << (unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample)); - pOutputSamples[i*2+1] = (pInputSamples1[i] << (unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample)); + pOutputSamples[i*2+0] = (drflac_int32)((drflac_uint32)pInputSamples0[i] << (unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample)); + pOutputSamples[i*2+1] = (drflac_int32)((drflac_uint32)pInputSamples1[i] << (unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample)); } } #endif -static DRFLAC_INLINE void drflac_read_pcm_frames_s32__decode_independent_stereo__scalar(drflac* pFlac, drflac_uint64 frameCount, drflac_int32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, drflac_int32* pOutputSamples) +static DRFLAC_INLINE void drflac_read_pcm_frames_s32__decode_independent_stereo__scalar(drflac* pFlac, drflac_uint64 
frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, drflac_int32* pOutputSamples) { drflac_uint64 i; drflac_uint64 frameCount4 = frameCount >> 2; - - drflac_int32 shift0 = (unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample); - drflac_int32 shift1 = (unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample); + const drflac_uint32* pInputSamples0U32 = (const drflac_uint32*)pInputSamples0; + const drflac_uint32* pInputSamples1U32 = (const drflac_uint32*)pInputSamples1; + drflac_uint32 shift0 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample; + drflac_uint32 shift1 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample; for (i = 0; i < frameCount4; ++i) { - drflac_int32 tempL0 = pInputSamples0[i*4+0] << shift0; - drflac_int32 tempL1 = pInputSamples0[i*4+1] << shift0; - drflac_int32 tempL2 = pInputSamples0[i*4+2] << shift0; - drflac_int32 tempL3 = pInputSamples0[i*4+3] << shift0; + drflac_uint32 tempL0 = pInputSamples0U32[i*4+0] << shift0; + drflac_uint32 tempL1 = pInputSamples0U32[i*4+1] << shift0; + drflac_uint32 tempL2 = pInputSamples0U32[i*4+2] << shift0; + drflac_uint32 tempL3 = pInputSamples0U32[i*4+3] << shift0; - drflac_int32 tempR0 = pInputSamples1[i*4+0] << shift1; - drflac_int32 tempR1 = pInputSamples1[i*4+1] << shift1; - drflac_int32 tempR2 = pInputSamples1[i*4+2] << shift1; - drflac_int32 tempR3 = pInputSamples1[i*4+3] << shift1; + drflac_uint32 tempR0 = pInputSamples1U32[i*4+0] << shift1; + drflac_uint32 tempR1 = pInputSamples1U32[i*4+1] << shift1; + drflac_uint32 tempR2 = pInputSamples1U32[i*4+2] << shift1; + drflac_uint32 tempR3 = pInputSamples1U32[i*4+3] << shift1; - pOutputSamples[i*8+0] = tempL0; - pOutputSamples[i*8+1] = tempR0; - pOutputSamples[i*8+2] = tempL1; - pOutputSamples[i*8+3] = tempR1; - pOutputSamples[i*8+4] = tempL2; - pOutputSamples[i*8+5] = tempR2; - 
pOutputSamples[i*8+6] = tempL3; - pOutputSamples[i*8+7] = tempR3; + pOutputSamples[i*8+0] = (drflac_int32)tempL0; + pOutputSamples[i*8+1] = (drflac_int32)tempR0; + pOutputSamples[i*8+2] = (drflac_int32)tempL1; + pOutputSamples[i*8+3] = (drflac_int32)tempR1; + pOutputSamples[i*8+4] = (drflac_int32)tempL2; + pOutputSamples[i*8+5] = (drflac_int32)tempR2; + pOutputSamples[i*8+6] = (drflac_int32)tempL3; + pOutputSamples[i*8+7] = (drflac_int32)tempR3; } for (i = (frameCount4 << 2); i < frameCount; ++i) { - pOutputSamples[i*2+0] = (pInputSamples0[i] << shift0); - pOutputSamples[i*2+1] = (pInputSamples1[i] << shift1); + pOutputSamples[i*2+0] = (drflac_int32)(pInputSamples0U32[i] << shift0); + pOutputSamples[i*2+1] = (drflac_int32)(pInputSamples1U32[i] << shift1); } } #if defined(DRFLAC_SUPPORT_SSE2) -static DRFLAC_INLINE void drflac_read_pcm_frames_s32__decode_independent_stereo__sse2(drflac* pFlac, drflac_uint64 frameCount, drflac_int32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, drflac_int32* pOutputSamples) +static DRFLAC_INLINE void drflac_read_pcm_frames_s32__decode_independent_stereo__sse2(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, drflac_int32* pOutputSamples) { drflac_uint64 i; drflac_uint64 frameCount4 = frameCount >> 2; - - int shift0 = (unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample); - int shift1 = (unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample); + const drflac_uint32* pInputSamples0U32 = (const drflac_uint32*)pInputSamples0; + const drflac_uint32* pInputSamples1U32 = (const drflac_uint32*)pInputSamples1; + drflac_uint32 shift0 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample; + drflac_uint32 shift1 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample; for (i = 0; i < frameCount4; ++i) { 
__m128i left = _mm_slli_epi32(_mm_loadu_si128((const __m128i*)pInputSamples0 + i), shift0); @@ -9265,20 +9289,21 @@ static DRFLAC_INLINE void drflac_read_pcm_frames_s32__decode_independent_stereo_ } for (i = (frameCount4 << 2); i < frameCount; ++i) { - pOutputSamples[i*2+0] = (pInputSamples0[i] << shift0); - pOutputSamples[i*2+1] = (pInputSamples1[i] << shift1); + pOutputSamples[i*2+0] = (drflac_int32)(pInputSamples0U32[i] << shift0); + pOutputSamples[i*2+1] = (drflac_int32)(pInputSamples1U32[i] << shift1); } } #endif #if defined(DRFLAC_SUPPORT_NEON) -static DRFLAC_INLINE void drflac_read_pcm_frames_s32__decode_independent_stereo__neon(drflac* pFlac, drflac_uint64 frameCount, drflac_int32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, drflac_int32* pOutputSamples) +static DRFLAC_INLINE void drflac_read_pcm_frames_s32__decode_independent_stereo__neon(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, drflac_int32* pOutputSamples) { drflac_uint64 i; drflac_uint64 frameCount4 = frameCount >> 2; - - int shift0 = (unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample); - int shift1 = (unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample); + const drflac_uint32* pInputSamples0U32 = (const drflac_uint32*)pInputSamples0; + const drflac_uint32* pInputSamples1U32 = (const drflac_uint32*)pInputSamples1; + drflac_uint32 shift0 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample; + drflac_uint32 shift1 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample; int32x4_t shift4_0 = vdupq_n_s32(shift0); int32x4_t shift4_1 = vdupq_n_s32(shift1); @@ -9287,20 +9312,20 @@ static DRFLAC_INLINE void drflac_read_pcm_frames_s32__decode_independent_stereo_ int32x4_t left; int32x4_t right; - left = vshlq_s32(vld1q_s32(pInputSamples0 + i*4), 
shift4_0); - right = vshlq_s32(vld1q_s32(pInputSamples1 + i*4), shift4_1); + left = vreinterpretq_s32_u32(vshlq_u32(vld1q_u32(pInputSamples0U32 + i*4), shift4_0)); + right = vreinterpretq_s32_u32(vshlq_u32(vld1q_u32(pInputSamples1U32 + i*4), shift4_1)); drflac__vst2q_s32(pOutputSamples + i*8, vzipq_s32(left, right)); } for (i = (frameCount4 << 2); i < frameCount; ++i) { - pOutputSamples[i*2+0] = (pInputSamples0[i] << shift0); - pOutputSamples[i*2+1] = (pInputSamples1[i] << shift1); + pOutputSamples[i*2+0] = (drflac_int32)(pInputSamples0U32[i] << shift0); + pOutputSamples[i*2+1] = (drflac_int32)(pInputSamples1U32[i] << shift1); } } #endif -static DRFLAC_INLINE void drflac_read_pcm_frames_s32__decode_independent_stereo(drflac* pFlac, drflac_uint64 frameCount, drflac_int32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, drflac_int32* pOutputSamples) +static DRFLAC_INLINE void drflac_read_pcm_frames_s32__decode_independent_stereo(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, drflac_int32* pOutputSamples) { #if defined(DRFLAC_SUPPORT_SSE2) if (drflac__gIsSSE2Supported && pFlac->bitsPerSample <= 24) { @@ -9325,7 +9350,7 @@ static DRFLAC_INLINE void drflac_read_pcm_frames_s32__decode_independent_stereo( DRFLAC_API drflac_uint64 drflac_read_pcm_frames_s32(drflac* pFlac, drflac_uint64 framesToRead, drflac_int32* pBufferOut) { drflac_uint64 framesRead; - drflac_int32 unusedBitsPerSample; + drflac_uint32 unusedBitsPerSample; if (pFlac == NULL || framesToRead == 0) { return 0; @@ -9335,6 +9360,7 @@ DRFLAC_API drflac_uint64 drflac_read_pcm_frames_s32(drflac* pFlac, drflac_uint64 return drflac__seek_forward_by_pcm_frames(pFlac, framesToRead); } + DRFLAC_ASSERT(pFlac->bitsPerSample <= 32); unusedBitsPerSample = 32 - pFlac->bitsPerSample; framesRead = 0; @@ -9404,13 +9430,13 @@ DRFLAC_API drflac_uint64 drflac_read_pcm_frames_s32(drflac* 
pFlac, drflac_uint64 #if 0 -static DRFLAC_INLINE void drflac_read_pcm_frames_s16__decode_left_side__reference(drflac* pFlac, drflac_uint64 frameCount, drflac_int32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, drflac_int16* pOutputSamples) +static DRFLAC_INLINE void drflac_read_pcm_frames_s16__decode_left_side__reference(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, drflac_int16* pOutputSamples) { drflac_uint64 i; for (i = 0; i < frameCount; ++i) { - drflac_int32 left = pInputSamples0[i] << (unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample); - drflac_int32 side = pInputSamples1[i] << (unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample); - drflac_int32 right = left - side; + drflac_uint32 left = (drflac_uint32)pInputSamples0[i] << (unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample); + drflac_uint32 side = (drflac_uint32)pInputSamples1[i] << (unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample); + drflac_uint32 right = left - side; left >>= 16; right >>= 16; @@ -9421,28 +9447,30 @@ static DRFLAC_INLINE void drflac_read_pcm_frames_s16__decode_left_side__referenc } #endif -static DRFLAC_INLINE void drflac_read_pcm_frames_s16__decode_left_side__scalar(drflac* pFlac, drflac_uint64 frameCount, drflac_int32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, drflac_int16* pOutputSamples) +static DRFLAC_INLINE void drflac_read_pcm_frames_s16__decode_left_side__scalar(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, drflac_int16* pOutputSamples) { drflac_uint64 i; drflac_uint64 frameCount4 = frameCount >> 2; + const drflac_uint32* pInputSamples0U32 = (const 
drflac_uint32*)pInputSamples0; + const drflac_uint32* pInputSamples1U32 = (const drflac_uint32*)pInputSamples1; + drflac_uint32 shift0 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample; + drflac_uint32 shift1 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample; - drflac_int32 shift0 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample; - drflac_int32 shift1 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample; for (i = 0; i < frameCount4; ++i) { - drflac_int32 left0 = pInputSamples0[i*4+0] << shift0; - drflac_int32 left1 = pInputSamples0[i*4+1] << shift0; - drflac_int32 left2 = pInputSamples0[i*4+2] << shift0; - drflac_int32 left3 = pInputSamples0[i*4+3] << shift0; + drflac_uint32 left0 = pInputSamples0U32[i*4+0] << shift0; + drflac_uint32 left1 = pInputSamples0U32[i*4+1] << shift0; + drflac_uint32 left2 = pInputSamples0U32[i*4+2] << shift0; + drflac_uint32 left3 = pInputSamples0U32[i*4+3] << shift0; - drflac_int32 side0 = pInputSamples1[i*4+0] << shift1; - drflac_int32 side1 = pInputSamples1[i*4+1] << shift1; - drflac_int32 side2 = pInputSamples1[i*4+2] << shift1; - drflac_int32 side3 = pInputSamples1[i*4+3] << shift1; + drflac_uint32 side0 = pInputSamples1U32[i*4+0] << shift1; + drflac_uint32 side1 = pInputSamples1U32[i*4+1] << shift1; + drflac_uint32 side2 = pInputSamples1U32[i*4+2] << shift1; + drflac_uint32 side3 = pInputSamples1U32[i*4+3] << shift1; - drflac_int32 right0 = left0 - side0; - drflac_int32 right1 = left1 - side1; - drflac_int32 right2 = left2 - side2; - drflac_int32 right3 = left3 - side3; + drflac_uint32 right0 = left0 - side0; + drflac_uint32 right1 = left1 - side1; + drflac_uint32 right2 = left2 - side2; + drflac_uint32 right3 = left3 - side3; left0 >>= 16; left1 >>= 16; @@ -9465,9 +9493,9 @@ static DRFLAC_INLINE void drflac_read_pcm_frames_s16__decode_left_side__scalar(d } for (i = (frameCount4 << 2); i < frameCount; ++i) { - 
drflac_int32 left = pInputSamples0[i] << shift0; - drflac_int32 side = pInputSamples1[i] << shift1; - drflac_int32 right = left - side; + drflac_uint32 left = pInputSamples0U32[i] << shift0; + drflac_uint32 side = pInputSamples1U32[i] << shift1; + drflac_uint32 right = left - side; left >>= 16; right >>= 16; @@ -9478,20 +9506,17 @@ static DRFLAC_INLINE void drflac_read_pcm_frames_s16__decode_left_side__scalar(d } #if defined(DRFLAC_SUPPORT_SSE2) -static DRFLAC_INLINE void drflac_read_pcm_frames_s16__decode_left_side__sse2(drflac* pFlac, drflac_uint64 frameCount, drflac_int32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, drflac_int16* pOutputSamples) +static DRFLAC_INLINE void drflac_read_pcm_frames_s16__decode_left_side__sse2(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, drflac_int16* pOutputSamples) { - drflac_uint64 frameCount4; - drflac_int32 shift0; - drflac_int32 shift1; drflac_uint64 i; + drflac_uint64 frameCount4 = frameCount >> 2; + const drflac_uint32* pInputSamples0U32 = (const drflac_uint32*)pInputSamples0; + const drflac_uint32* pInputSamples1U32 = (const drflac_uint32*)pInputSamples1; + drflac_uint32 shift0 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample; + drflac_uint32 shift1 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample; DRFLAC_ASSERT(pFlac->bitsPerSample <= 24); - frameCount4 = frameCount >> 2; - - shift0 = (unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample); - shift1 = (unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample); - for (i = 0; i < frameCount4; ++i) { __m128i left = _mm_slli_epi32(_mm_loadu_si128((const __m128i*)pInputSamples0 + i), shift0); __m128i side = _mm_slli_epi32(_mm_loadu_si128((const __m128i*)pInputSamples1 + i), shift1); @@ -9504,9 +9529,9 @@ static 
DRFLAC_INLINE void drflac_read_pcm_frames_s16__decode_left_side__sse2(drf } for (i = (frameCount4 << 2); i < frameCount; ++i) { - drflac_int32 left = pInputSamples0[i] << shift0; - drflac_int32 side = pInputSamples1[i] << shift1; - drflac_int32 right = left - side; + drflac_uint32 left = pInputSamples0U32[i] << shift0; + drflac_uint32 side = pInputSamples1U32[i] << shift1; + drflac_uint32 right = left - side; left >>= 16; right >>= 16; @@ -9518,44 +9543,41 @@ static DRFLAC_INLINE void drflac_read_pcm_frames_s16__decode_left_side__sse2(drf #endif #if defined(DRFLAC_SUPPORT_NEON) -static DRFLAC_INLINE void drflac_read_pcm_frames_s16__decode_left_side__neon(drflac* pFlac, drflac_uint64 frameCount, drflac_int32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, drflac_int16* pOutputSamples) +static DRFLAC_INLINE void drflac_read_pcm_frames_s16__decode_left_side__neon(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, drflac_int16* pOutputSamples) { - drflac_uint64 frameCount4; - drflac_int32 shift0; - drflac_int32 shift1; drflac_uint64 i; + drflac_uint64 frameCount4 = frameCount >> 2; + const drflac_uint32* pInputSamples0U32 = (const drflac_uint32*)pInputSamples0; + const drflac_uint32* pInputSamples1U32 = (const drflac_uint32*)pInputSamples1; + drflac_uint32 shift0 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample; + drflac_uint32 shift1 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample; int32x4_t shift0_4; int32x4_t shift1_4; DRFLAC_ASSERT(pFlac->bitsPerSample <= 24); - frameCount4 = frameCount >> 2; - - shift0 = (unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample); - shift1 = (unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample); - shift0_4 = vdupq_n_s32(shift0); shift1_4 = vdupq_n_s32(shift1); for (i = 0; i < 
frameCount4; ++i) { - int32x4_t left; - int32x4_t side; - int32x4_t right; + uint32x4_t left; + uint32x4_t side; + uint32x4_t right; - left = vshlq_s32(vld1q_s32(pInputSamples0 + i*4), shift0_4); - side = vshlq_s32(vld1q_s32(pInputSamples1 + i*4), shift1_4); - right = vsubq_s32(left, side); + left = vshlq_u32(vld1q_u32(pInputSamples0U32 + i*4), shift0_4); + side = vshlq_u32(vld1q_u32(pInputSamples1U32 + i*4), shift1_4); + right = vsubq_u32(left, side); - left = vshrq_n_s32(left, 16); - right = vshrq_n_s32(right, 16); + left = vshrq_n_u32(left, 16); + right = vshrq_n_u32(right, 16); - drflac__vst2q_s16(pOutputSamples + i*8, vzip_s16(vmovn_s32(left), vmovn_s32(right))); + drflac__vst2q_u16((drflac_uint16*)pOutputSamples + i*8, vzip_u16(vmovn_u32(left), vmovn_u32(right))); } for (i = (frameCount4 << 2); i < frameCount; ++i) { - drflac_int32 left = pInputSamples0[i] << shift0; - drflac_int32 side = pInputSamples1[i] << shift1; - drflac_int32 right = left - side; + drflac_uint32 left = pInputSamples0U32[i] << shift0; + drflac_uint32 side = pInputSamples1U32[i] << shift1; + drflac_uint32 right = left - side; left >>= 16; right >>= 16; @@ -9566,7 +9588,7 @@ static DRFLAC_INLINE void drflac_read_pcm_frames_s16__decode_left_side__neon(drf } #endif -static DRFLAC_INLINE void drflac_read_pcm_frames_s16__decode_left_side(drflac* pFlac, drflac_uint64 frameCount, drflac_int32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, drflac_int16* pOutputSamples) +static DRFLAC_INLINE void drflac_read_pcm_frames_s16__decode_left_side(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, drflac_int16* pOutputSamples) { #if defined(DRFLAC_SUPPORT_SSE2) if (drflac__gIsSSE2Supported && pFlac->bitsPerSample <= 24) { @@ -9589,13 +9611,13 @@ static DRFLAC_INLINE void drflac_read_pcm_frames_s16__decode_left_side(drflac* p #if 0 -static DRFLAC_INLINE void 
drflac_read_pcm_frames_s16__decode_right_side__reference(drflac* pFlac, drflac_uint64 frameCount, drflac_int32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, drflac_int16* pOutputSamples) +static DRFLAC_INLINE void drflac_read_pcm_frames_s16__decode_right_side__reference(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, drflac_int16* pOutputSamples) { drflac_uint64 i; for (i = 0; i < frameCount; ++i) { - drflac_int32 side = pInputSamples0[i] << (unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample); - drflac_int32 right = pInputSamples1[i] << (unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample); - drflac_int32 left = right + side; + drflac_uint32 side = (drflac_uint32)pInputSamples0[i] << (unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample); + drflac_uint32 right = (drflac_uint32)pInputSamples1[i] << (unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample); + drflac_uint32 left = right + side; left >>= 16; right >>= 16; @@ -9606,28 +9628,30 @@ static DRFLAC_INLINE void drflac_read_pcm_frames_s16__decode_right_side__referen } #endif -static DRFLAC_INLINE void drflac_read_pcm_frames_s16__decode_right_side__scalar(drflac* pFlac, drflac_uint64 frameCount, drflac_int32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, drflac_int16* pOutputSamples) +static DRFLAC_INLINE void drflac_read_pcm_frames_s16__decode_right_side__scalar(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, drflac_int16* pOutputSamples) { drflac_uint64 i; drflac_uint64 frameCount4 = frameCount >> 2; + const drflac_uint32* pInputSamples0U32 = (const drflac_uint32*)pInputSamples0; + const drflac_uint32* 
pInputSamples1U32 = (const drflac_uint32*)pInputSamples1; + drflac_uint32 shift0 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample; + drflac_uint32 shift1 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample; - drflac_int32 shift0 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample; - drflac_int32 shift1 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample; for (i = 0; i < frameCount4; ++i) { - drflac_int32 side0 = pInputSamples0[i*4+0] << shift0; - drflac_int32 side1 = pInputSamples0[i*4+1] << shift0; - drflac_int32 side2 = pInputSamples0[i*4+2] << shift0; - drflac_int32 side3 = pInputSamples0[i*4+3] << shift0; + drflac_uint32 side0 = pInputSamples0U32[i*4+0] << shift0; + drflac_uint32 side1 = pInputSamples0U32[i*4+1] << shift0; + drflac_uint32 side2 = pInputSamples0U32[i*4+2] << shift0; + drflac_uint32 side3 = pInputSamples0U32[i*4+3] << shift0; - drflac_int32 right0 = pInputSamples1[i*4+0] << shift1; - drflac_int32 right1 = pInputSamples1[i*4+1] << shift1; - drflac_int32 right2 = pInputSamples1[i*4+2] << shift1; - drflac_int32 right3 = pInputSamples1[i*4+3] << shift1; + drflac_uint32 right0 = pInputSamples1U32[i*4+0] << shift1; + drflac_uint32 right1 = pInputSamples1U32[i*4+1] << shift1; + drflac_uint32 right2 = pInputSamples1U32[i*4+2] << shift1; + drflac_uint32 right3 = pInputSamples1U32[i*4+3] << shift1; - drflac_int32 left0 = right0 + side0; - drflac_int32 left1 = right1 + side1; - drflac_int32 left2 = right2 + side2; - drflac_int32 left3 = right3 + side3; + drflac_uint32 left0 = right0 + side0; + drflac_uint32 left1 = right1 + side1; + drflac_uint32 left2 = right2 + side2; + drflac_uint32 left3 = right3 + side3; left0 >>= 16; left1 >>= 16; @@ -9650,9 +9674,9 @@ static DRFLAC_INLINE void drflac_read_pcm_frames_s16__decode_right_side__scalar( } for (i = (frameCount4 << 2); i < frameCount; ++i) { - drflac_int32 side = pInputSamples0[i] << shift0; 
- drflac_int32 right = pInputSamples1[i] << shift1; - drflac_int32 left = right + side; + drflac_uint32 side = pInputSamples0U32[i] << shift0; + drflac_uint32 right = pInputSamples1U32[i] << shift1; + drflac_uint32 left = right + side; left >>= 16; right >>= 16; @@ -9663,20 +9687,17 @@ static DRFLAC_INLINE void drflac_read_pcm_frames_s16__decode_right_side__scalar( } #if defined(DRFLAC_SUPPORT_SSE2) -static DRFLAC_INLINE void drflac_read_pcm_frames_s16__decode_right_side__sse2(drflac* pFlac, drflac_uint64 frameCount, drflac_int32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, drflac_int16* pOutputSamples) +static DRFLAC_INLINE void drflac_read_pcm_frames_s16__decode_right_side__sse2(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, drflac_int16* pOutputSamples) { - drflac_uint64 frameCount4; - drflac_int32 shift0; - drflac_int32 shift1; drflac_uint64 i; + drflac_uint64 frameCount4 = frameCount >> 2; + const drflac_uint32* pInputSamples0U32 = (const drflac_uint32*)pInputSamples0; + const drflac_uint32* pInputSamples1U32 = (const drflac_uint32*)pInputSamples1; + drflac_uint32 shift0 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample; + drflac_uint32 shift1 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample; DRFLAC_ASSERT(pFlac->bitsPerSample <= 24); - frameCount4 = frameCount >> 2; - - shift0 = (unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample); - shift1 = (unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample); - for (i = 0; i < frameCount4; ++i) { __m128i side = _mm_slli_epi32(_mm_loadu_si128((const __m128i*)pInputSamples0 + i), shift0); __m128i right = _mm_slli_epi32(_mm_loadu_si128((const __m128i*)pInputSamples1 + i), shift1); @@ -9689,9 +9710,9 @@ static DRFLAC_INLINE void 
drflac_read_pcm_frames_s16__decode_right_side__sse2(dr } for (i = (frameCount4 << 2); i < frameCount; ++i) { - drflac_int32 side = pInputSamples0[i] << shift0; - drflac_int32 right = pInputSamples1[i] << shift1; - drflac_int32 left = right + side; + drflac_uint32 side = pInputSamples0U32[i] << shift0; + drflac_uint32 right = pInputSamples1U32[i] << shift1; + drflac_uint32 left = right + side; left >>= 16; right >>= 16; @@ -9703,44 +9724,41 @@ static DRFLAC_INLINE void drflac_read_pcm_frames_s16__decode_right_side__sse2(dr #endif #if defined(DRFLAC_SUPPORT_NEON) -static DRFLAC_INLINE void drflac_read_pcm_frames_s16__decode_right_side__neon(drflac* pFlac, drflac_uint64 frameCount, drflac_int32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, drflac_int16* pOutputSamples) +static DRFLAC_INLINE void drflac_read_pcm_frames_s16__decode_right_side__neon(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, drflac_int16* pOutputSamples) { - drflac_uint64 frameCount4; - drflac_int32 shift0; - drflac_int32 shift1; drflac_uint64 i; + drflac_uint64 frameCount4 = frameCount >> 2; + const drflac_uint32* pInputSamples0U32 = (const drflac_uint32*)pInputSamples0; + const drflac_uint32* pInputSamples1U32 = (const drflac_uint32*)pInputSamples1; + drflac_uint32 shift0 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample; + drflac_uint32 shift1 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample; int32x4_t shift0_4; int32x4_t shift1_4; DRFLAC_ASSERT(pFlac->bitsPerSample <= 24); - frameCount4 = frameCount >> 2; - - shift0 = (unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample); - shift1 = (unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample); - shift0_4 = vdupq_n_s32(shift0); shift1_4 = vdupq_n_s32(shift1); for (i = 0; i < frameCount4; ++i) { 
- int32x4_t side; - int32x4_t right; - int32x4_t left; + uint32x4_t side; + uint32x4_t right; + uint32x4_t left; - side = vshlq_s32(vld1q_s32(pInputSamples0 + i*4), shift0_4); - right = vshlq_s32(vld1q_s32(pInputSamples1 + i*4), shift1_4); - left = vaddq_s32(right, side); + side = vshlq_u32(vld1q_u32(pInputSamples0U32 + i*4), shift0_4); + right = vshlq_u32(vld1q_u32(pInputSamples1U32 + i*4), shift1_4); + left = vaddq_u32(right, side); - left = vshrq_n_s32(left, 16); - right = vshrq_n_s32(right, 16); + left = vshrq_n_u32(left, 16); + right = vshrq_n_u32(right, 16); - drflac__vst2q_s16(pOutputSamples + i*8, vzip_s16(vmovn_s32(left), vmovn_s32(right))); + drflac__vst2q_u16((drflac_uint16*)pOutputSamples + i*8, vzip_u16(vmovn_u32(left), vmovn_u32(right))); } for (i = (frameCount4 << 2); i < frameCount; ++i) { - drflac_int32 side = pInputSamples0[i] << shift0; - drflac_int32 right = pInputSamples1[i] << shift1; - drflac_int32 left = right + side; + drflac_uint32 side = pInputSamples0U32[i] << shift0; + drflac_uint32 right = pInputSamples1U32[i] << shift1; + drflac_uint32 left = right + side; left >>= 16; right >>= 16; @@ -9751,7 +9769,7 @@ static DRFLAC_INLINE void drflac_read_pcm_frames_s16__decode_right_side__neon(dr } #endif -static DRFLAC_INLINE void drflac_read_pcm_frames_s16__decode_right_side(drflac* pFlac, drflac_uint64 frameCount, drflac_int32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, drflac_int16* pOutputSamples) +static DRFLAC_INLINE void drflac_read_pcm_frames_s16__decode_right_side(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, drflac_int16* pOutputSamples) { #if defined(DRFLAC_SUPPORT_SSE2) if (drflac__gIsSSE2Supported && pFlac->bitsPerSample <= 24) { @@ -9774,62 +9792,64 @@ static DRFLAC_INLINE void drflac_read_pcm_frames_s16__decode_right_side(drflac* #if 0 -static DRFLAC_INLINE void 
drflac_read_pcm_frames_s16__decode_mid_side__reference(drflac* pFlac, drflac_uint64 frameCount, drflac_int32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, drflac_int16* pOutputSamples) +static DRFLAC_INLINE void drflac_read_pcm_frames_s16__decode_mid_side__reference(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, drflac_int16* pOutputSamples) { for (drflac_uint64 i = 0; i < frameCount; ++i) { - drflac_int32 mid = pInputSamples0[i] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample; - drflac_int32 side = pInputSamples1[i] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample; + drflac_uint32 mid = (drflac_uint32)pInputSamples0[i] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample; + drflac_uint32 side = (drflac_uint32)pInputSamples1[i] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample; - mid = (((drflac_uint32)mid) << 1) | (side & 0x01); + mid = (mid << 1) | (side & 0x01); - pOutputSamples[i*2+0] = (drflac_int16)((((mid + side) >> 1) << unusedBitsPerSample) >> 16); - pOutputSamples[i*2+1] = (drflac_int16)((((mid - side) >> 1) << unusedBitsPerSample) >> 16); + pOutputSamples[i*2+0] = (drflac_int16)(((drflac_uint32)((drflac_int32)(mid + side) >> 1) << unusedBitsPerSample) >> 16); + pOutputSamples[i*2+1] = (drflac_int16)(((drflac_uint32)((drflac_int32)(mid - side) >> 1) << unusedBitsPerSample) >> 16); } } #endif -static DRFLAC_INLINE void drflac_read_pcm_frames_s16__decode_mid_side__scalar(drflac* pFlac, drflac_uint64 frameCount, drflac_int32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, drflac_int16* pOutputSamples) +static DRFLAC_INLINE void drflac_read_pcm_frames_s16__decode_mid_side__scalar(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, 
drflac_int16* pOutputSamples) { drflac_uint64 i; drflac_uint64 frameCount4 = frameCount >> 2; + const drflac_uint32* pInputSamples0U32 = (const drflac_uint32*)pInputSamples0; + const drflac_uint32* pInputSamples1U32 = (const drflac_uint32*)pInputSamples1; + drflac_uint32 shift = unusedBitsPerSample; - int shift = unusedBitsPerSample; if (shift > 0) { shift -= 1; for (i = 0; i < frameCount4; ++i) { - drflac_int32 temp0L; - drflac_int32 temp1L; - drflac_int32 temp2L; - drflac_int32 temp3L; - drflac_int32 temp0R; - drflac_int32 temp1R; - drflac_int32 temp2R; - drflac_int32 temp3R; + drflac_uint32 temp0L; + drflac_uint32 temp1L; + drflac_uint32 temp2L; + drflac_uint32 temp3L; + drflac_uint32 temp0R; + drflac_uint32 temp1R; + drflac_uint32 temp2R; + drflac_uint32 temp3R; - drflac_int32 mid0 = pInputSamples0[i*4+0] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample; - drflac_int32 mid1 = pInputSamples0[i*4+1] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample; - drflac_int32 mid2 = pInputSamples0[i*4+2] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample; - drflac_int32 mid3 = pInputSamples0[i*4+3] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample; + drflac_uint32 mid0 = pInputSamples0U32[i*4+0] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample; + drflac_uint32 mid1 = pInputSamples0U32[i*4+1] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample; + drflac_uint32 mid2 = pInputSamples0U32[i*4+2] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample; + drflac_uint32 mid3 = pInputSamples0U32[i*4+3] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample; - drflac_int32 side0 = pInputSamples1[i*4+0] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample; - drflac_int32 side1 = pInputSamples1[i*4+1] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample; - drflac_int32 side2 = pInputSamples1[i*4+2] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample; - drflac_int32 side3 = pInputSamples1[i*4+3] << 
pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample; + drflac_uint32 side0 = pInputSamples1U32[i*4+0] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample; + drflac_uint32 side1 = pInputSamples1U32[i*4+1] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample; + drflac_uint32 side2 = pInputSamples1U32[i*4+2] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample; + drflac_uint32 side3 = pInputSamples1U32[i*4+3] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample; - mid0 = (((drflac_uint32)mid0) << 1) | (side0 & 0x01); - mid1 = (((drflac_uint32)mid1) << 1) | (side1 & 0x01); - mid2 = (((drflac_uint32)mid2) << 1) | (side2 & 0x01); - mid3 = (((drflac_uint32)mid3) << 1) | (side3 & 0x01); + mid0 = (mid0 << 1) | (side0 & 0x01); + mid1 = (mid1 << 1) | (side1 & 0x01); + mid2 = (mid2 << 1) | (side2 & 0x01); + mid3 = (mid3 << 1) | (side3 & 0x01); - temp0L = ((mid0 + side0) << shift); - temp1L = ((mid1 + side1) << shift); - temp2L = ((mid2 + side2) << shift); - temp3L = ((mid3 + side3) << shift); + temp0L = (mid0 + side0) << shift; + temp1L = (mid1 + side1) << shift; + temp2L = (mid2 + side2) << shift; + temp3L = (mid3 + side3) << shift; - temp0R = ((mid0 - side0) << shift); - temp1R = ((mid1 - side1) << shift); - temp2R = ((mid2 - side2) << shift); - temp3R = ((mid3 - side3) << shift); + temp0R = (mid0 - side0) << shift; + temp1R = (mid1 - side1) << shift; + temp2R = (mid2 - side2) << shift; + temp3R = (mid3 - side3) << shift; temp0L >>= 16; temp1L >>= 16; @@ -9852,39 +9872,39 @@ static DRFLAC_INLINE void drflac_read_pcm_frames_s16__decode_mid_side__scalar(dr } } else { for (i = 0; i < frameCount4; ++i) { - drflac_int32 temp0L; - drflac_int32 temp1L; - drflac_int32 temp2L; - drflac_int32 temp3L; - drflac_int32 temp0R; - drflac_int32 temp1R; - drflac_int32 temp2R; - drflac_int32 temp3R; + drflac_uint32 temp0L; + drflac_uint32 temp1L; + drflac_uint32 temp2L; + drflac_uint32 temp3L; + drflac_uint32 temp0R; + drflac_uint32 temp1R; + drflac_uint32 
temp2R; + drflac_uint32 temp3R; - drflac_int32 mid0 = pInputSamples0[i*4+0] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample; - drflac_int32 mid1 = pInputSamples0[i*4+1] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample; - drflac_int32 mid2 = pInputSamples0[i*4+2] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample; - drflac_int32 mid3 = pInputSamples0[i*4+3] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample; + drflac_uint32 mid0 = pInputSamples0U32[i*4+0] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample; + drflac_uint32 mid1 = pInputSamples0U32[i*4+1] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample; + drflac_uint32 mid2 = pInputSamples0U32[i*4+2] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample; + drflac_uint32 mid3 = pInputSamples0U32[i*4+3] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample; - drflac_int32 side0 = pInputSamples1[i*4+0] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample; - drflac_int32 side1 = pInputSamples1[i*4+1] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample; - drflac_int32 side2 = pInputSamples1[i*4+2] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample; - drflac_int32 side3 = pInputSamples1[i*4+3] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample; + drflac_uint32 side0 = pInputSamples1U32[i*4+0] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample; + drflac_uint32 side1 = pInputSamples1U32[i*4+1] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample; + drflac_uint32 side2 = pInputSamples1U32[i*4+2] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample; + drflac_uint32 side3 = pInputSamples1U32[i*4+3] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample; - mid0 = (((drflac_uint32)mid0) << 1) | (side0 & 0x01); - mid1 = (((drflac_uint32)mid1) << 1) | (side1 & 0x01); - mid2 = (((drflac_uint32)mid2) << 1) | (side2 & 0x01); - mid3 = (((drflac_uint32)mid3) << 1) | (side3 & 0x01); + mid0 = (mid0 << 1) | 
(side0 & 0x01); + mid1 = (mid1 << 1) | (side1 & 0x01); + mid2 = (mid2 << 1) | (side2 & 0x01); + mid3 = (mid3 << 1) | (side3 & 0x01); - temp0L = ((mid0 + side0) >> 1); - temp1L = ((mid1 + side1) >> 1); - temp2L = ((mid2 + side2) >> 1); - temp3L = ((mid3 + side3) >> 1); + temp0L = ((drflac_int32)(mid0 + side0) >> 1); + temp1L = ((drflac_int32)(mid1 + side1) >> 1); + temp2L = ((drflac_int32)(mid2 + side2) >> 1); + temp3L = ((drflac_int32)(mid3 + side3) >> 1); - temp0R = ((mid0 - side0) >> 1); - temp1R = ((mid1 - side1) >> 1); - temp2R = ((mid2 - side2) >> 1); - temp3R = ((mid3 - side3) >> 1); + temp0R = ((drflac_int32)(mid0 - side0) >> 1); + temp1R = ((drflac_int32)(mid1 - side1) >> 1); + temp2R = ((drflac_int32)(mid2 - side2) >> 1); + temp3R = ((drflac_int32)(mid3 - side3) >> 1); temp0L >>= 16; temp1L >>= 16; @@ -9908,28 +9928,27 @@ static DRFLAC_INLINE void drflac_read_pcm_frames_s16__decode_mid_side__scalar(dr } for (i = (frameCount4 << 2); i < frameCount; ++i) { - drflac_int32 mid = pInputSamples0[i] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample; - drflac_int32 side = pInputSamples1[i] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample; + drflac_uint32 mid = pInputSamples0U32[i] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample; + drflac_uint32 side = pInputSamples1U32[i] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample; - mid = (((drflac_uint32)mid) << 1) | (side & 0x01); + mid = (mid << 1) | (side & 0x01); - pOutputSamples[i*2+0] = (drflac_int16)((((mid + side) >> 1) << unusedBitsPerSample) >> 16); - pOutputSamples[i*2+1] = (drflac_int16)((((mid - side) >> 1) << unusedBitsPerSample) >> 16); + pOutputSamples[i*2+0] = (drflac_int16)(((drflac_uint32)((drflac_int32)(mid + side) >> 1) << unusedBitsPerSample) >> 16); + pOutputSamples[i*2+1] = (drflac_int16)(((drflac_uint32)((drflac_int32)(mid - side) >> 1) << unusedBitsPerSample) >> 16); } } #if defined(DRFLAC_SUPPORT_SSE2) -static DRFLAC_INLINE void 
drflac_read_pcm_frames_s16__decode_mid_side__sse2(drflac* pFlac, drflac_uint64 frameCount, drflac_int32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, drflac_int16* pOutputSamples) +static DRFLAC_INLINE void drflac_read_pcm_frames_s16__decode_mid_side__sse2(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, drflac_int16* pOutputSamples) { drflac_uint64 i; - drflac_uint64 frameCount4; - drflac_int32 shift; + drflac_uint64 frameCount4 = frameCount >> 2; + const drflac_uint32* pInputSamples0U32 = (const drflac_uint32*)pInputSamples0; + const drflac_uint32* pInputSamples1U32 = (const drflac_uint32*)pInputSamples1; + drflac_uint32 shift = unusedBitsPerSample; DRFLAC_ASSERT(pFlac->bitsPerSample <= 24); - frameCount4 = frameCount >> 2; - - shift = unusedBitsPerSample; if (shift == 0) { for (i = 0; i < frameCount4; ++i) { __m128i mid; @@ -9952,13 +9971,13 @@ static DRFLAC_INLINE void drflac_read_pcm_frames_s16__decode_mid_side__sse2(drfl } for (i = (frameCount4 << 2); i < frameCount; ++i) { - drflac_int32 mid = pInputSamples0[i] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample; - drflac_int32 side = pInputSamples1[i] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample; + drflac_uint32 mid = pInputSamples0U32[i] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample; + drflac_uint32 side = pInputSamples1U32[i] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample; - mid = (((drflac_uint32)mid) << 1) | (side & 0x01); + mid = (mid << 1) | (side & 0x01); - pOutputSamples[i*2+0] = (drflac_int16)(((mid + side) >> 1) >> 16); - pOutputSamples[i*2+1] = (drflac_int16)(((mid - side) >> 1) >> 16); + pOutputSamples[i*2+0] = (drflac_int16)(((drflac_int32)(mid + side) >> 1) >> 16); + pOutputSamples[i*2+1] = (drflac_int16)(((drflac_int32)(mid - side) >> 1) >> 16); } } else { shift -= 1; @@ -9983,10 +10002,10 @@ static 
DRFLAC_INLINE void drflac_read_pcm_frames_s16__decode_mid_side__sse2(drfl } for (i = (frameCount4 << 2); i < frameCount; ++i) { - drflac_int32 mid = pInputSamples0[i] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample; - drflac_int32 side = pInputSamples1[i] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample; + drflac_uint32 mid = pInputSamples0U32[i] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample; + drflac_uint32 side = pInputSamples1U32[i] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample; - mid = (((drflac_uint32)mid) << 1) | (side & 0x01); + mid = (mid << 1) | (side & 0x01); pOutputSamples[i*2+0] = (drflac_int16)(((mid + side) << shift) >> 16); pOutputSamples[i*2+1] = (drflac_int16)(((mid - side) << shift) >> 16); @@ -9996,36 +10015,35 @@ static DRFLAC_INLINE void drflac_read_pcm_frames_s16__decode_mid_side__sse2(drfl #endif #if defined(DRFLAC_SUPPORT_NEON) -static DRFLAC_INLINE void drflac_read_pcm_frames_s16__decode_mid_side__neon(drflac* pFlac, drflac_uint64 frameCount, drflac_int32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, drflac_int16* pOutputSamples) +static DRFLAC_INLINE void drflac_read_pcm_frames_s16__decode_mid_side__neon(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, drflac_int16* pOutputSamples) { drflac_uint64 i; - drflac_uint64 frameCount4; - int shift; + drflac_uint64 frameCount4 = frameCount >> 2; + const drflac_uint32* pInputSamples0U32 = (const drflac_uint32*)pInputSamples0; + const drflac_uint32* pInputSamples1U32 = (const drflac_uint32*)pInputSamples1; + drflac_uint32 shift = unusedBitsPerSample; int32x4_t wbpsShift0_4; /* wbps = Wasted Bits Per Sample */ int32x4_t wbpsShift1_4; /* wbps = Wasted Bits Per Sample */ DRFLAC_ASSERT(pFlac->bitsPerSample <= 24); - frameCount4 = frameCount >> 2; - wbpsShift0_4 = 
vdupq_n_s32(pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample); wbpsShift1_4 = vdupq_n_s32(pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample); - shift = unusedBitsPerSample; if (shift == 0) { for (i = 0; i < frameCount4; ++i) { - int32x4_t mid; - int32x4_t side; + uint32x4_t mid; + uint32x4_t side; int32x4_t left; int32x4_t right; - mid = vshlq_s32(vld1q_s32(pInputSamples0 + i*4), wbpsShift0_4); - side = vshlq_s32(vld1q_s32(pInputSamples1 + i*4), wbpsShift1_4); + mid = vshlq_u32(vld1q_u32(pInputSamples0U32 + i*4), wbpsShift0_4); + side = vshlq_u32(vld1q_u32(pInputSamples1U32 + i*4), wbpsShift1_4); - mid = vorrq_s32(vshlq_n_s32(mid, 1), vandq_s32(side, vdupq_n_s32(1))); + mid = vorrq_u32(vshlq_n_u32(mid, 1), vandq_u32(side, vdupq_n_u32(1))); - left = vshrq_n_s32(vaddq_s32(mid, side), 1); - right = vshrq_n_s32(vsubq_s32(mid, side), 1); + left = vshrq_n_s32(vreinterpretq_s32_u32(vaddq_u32(mid, side)), 1); + right = vshrq_n_s32(vreinterpretq_s32_u32(vsubq_u32(mid, side)), 1); left = vshrq_n_s32(left, 16); right = vshrq_n_s32(right, 16); @@ -10034,13 +10052,13 @@ static DRFLAC_INLINE void drflac_read_pcm_frames_s16__decode_mid_side__neon(drfl } for (i = (frameCount4 << 2); i < frameCount; ++i) { - drflac_int32 mid = pInputSamples0[i] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample; - drflac_int32 side = pInputSamples1[i] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample; + drflac_uint32 mid = pInputSamples0U32[i] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample; + drflac_uint32 side = pInputSamples1U32[i] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample; - mid = (((drflac_uint32)mid) << 1) | (side & 0x01); + mid = (mid << 1) | (side & 0x01); - pOutputSamples[i*2+0] = (drflac_int16)(((mid + side) >> 1) >> 16); - pOutputSamples[i*2+1] = (drflac_int16)(((mid - side) >> 1) >> 16); + pOutputSamples[i*2+0] = (drflac_int16)(((drflac_int32)(mid + side) >> 1) >> 16); + pOutputSamples[i*2+1] = 
(drflac_int16)(((drflac_int32)(mid - side) >> 1) >> 16); } } else { int32x4_t shift4; @@ -10049,18 +10067,18 @@ static DRFLAC_INLINE void drflac_read_pcm_frames_s16__decode_mid_side__neon(drfl shift4 = vdupq_n_s32(shift); for (i = 0; i < frameCount4; ++i) { - int32x4_t mid; - int32x4_t side; + uint32x4_t mid; + uint32x4_t side; int32x4_t left; int32x4_t right; - mid = vshlq_s32(vld1q_s32(pInputSamples0 + i*4), wbpsShift0_4); - side = vshlq_s32(vld1q_s32(pInputSamples1 + i*4), wbpsShift1_4); + mid = vshlq_u32(vld1q_u32(pInputSamples0U32 + i*4), wbpsShift0_4); + side = vshlq_u32(vld1q_u32(pInputSamples1U32 + i*4), wbpsShift1_4); - mid = vorrq_s32(vshlq_n_s32(mid, 1), vandq_s32(side, vdupq_n_s32(1))); + mid = vorrq_u32(vshlq_n_u32(mid, 1), vandq_u32(side, vdupq_n_u32(1))); - left = vshlq_s32(vaddq_s32(mid, side), shift4); - right = vshlq_s32(vsubq_s32(mid, side), shift4); + left = vreinterpretq_s32_u32(vshlq_u32(vaddq_u32(mid, side), shift4)); + right = vreinterpretq_s32_u32(vshlq_u32(vsubq_u32(mid, side), shift4)); left = vshrq_n_s32(left, 16); right = vshrq_n_s32(right, 16); @@ -10069,10 +10087,10 @@ static DRFLAC_INLINE void drflac_read_pcm_frames_s16__decode_mid_side__neon(drfl } for (i = (frameCount4 << 2); i < frameCount; ++i) { - drflac_int32 mid = pInputSamples0[i] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample; - drflac_int32 side = pInputSamples1[i] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample; + drflac_uint32 mid = pInputSamples0U32[i] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample; + drflac_uint32 side = pInputSamples1U32[i] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample; - mid = (((drflac_uint32)mid) << 1) | (side & 0x01); + mid = (mid << 1) | (side & 0x01); pOutputSamples[i*2+0] = (drflac_int16)(((mid + side) << shift) >> 16); pOutputSamples[i*2+1] = (drflac_int16)(((mid - side) << shift) >> 16); @@ -10081,7 +10099,7 @@ static DRFLAC_INLINE void drflac_read_pcm_frames_s16__decode_mid_side__neon(drfl } 
#endif -static DRFLAC_INLINE void drflac_read_pcm_frames_s16__decode_mid_side(drflac* pFlac, drflac_uint64 frameCount, drflac_int32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, drflac_int16* pOutputSamples) +static DRFLAC_INLINE void drflac_read_pcm_frames_s16__decode_mid_side(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, drflac_int16* pOutputSamples) { #if defined(DRFLAC_SUPPORT_SSE2) if (drflac__gIsSSE2Supported && pFlac->bitsPerSample <= 24) { @@ -10104,33 +10122,34 @@ static DRFLAC_INLINE void drflac_read_pcm_frames_s16__decode_mid_side(drflac* pF #if 0 -static DRFLAC_INLINE void drflac_read_pcm_frames_s16__decode_independent_stereo__reference(drflac* pFlac, drflac_uint64 frameCount, drflac_int32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, drflac_int16* pOutputSamples) +static DRFLAC_INLINE void drflac_read_pcm_frames_s16__decode_independent_stereo__reference(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, drflac_int16* pOutputSamples) { for (drflac_uint64 i = 0; i < frameCount; ++i) { - pOutputSamples[i*2+0] = (drflac_int16)((pInputSamples0[i] << (unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample)) >> 16); - pOutputSamples[i*2+1] = (drflac_int16)((pInputSamples1[i] << (unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample)) >> 16); + pOutputSamples[i*2+0] = (drflac_int16)((drflac_int32)((drflac_uint32)pInputSamples0[i] << (unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample)) >> 16); + pOutputSamples[i*2+1] = (drflac_int16)((drflac_int32)((drflac_uint32)pInputSamples1[i] << (unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample)) >> 16); } } #endif -static 
DRFLAC_INLINE void drflac_read_pcm_frames_s16__decode_independent_stereo__scalar(drflac* pFlac, drflac_uint64 frameCount, drflac_int32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, drflac_int16* pOutputSamples) +static DRFLAC_INLINE void drflac_read_pcm_frames_s16__decode_independent_stereo__scalar(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, drflac_int16* pOutputSamples) { drflac_uint64 i; drflac_uint64 frameCount4 = frameCount >> 2; - - int shift0 = (unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample); - int shift1 = (unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample); + const drflac_uint32* pInputSamples0U32 = (const drflac_uint32*)pInputSamples0; + const drflac_uint32* pInputSamples1U32 = (const drflac_uint32*)pInputSamples1; + drflac_uint32 shift0 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample; + drflac_uint32 shift1 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample; for (i = 0; i < frameCount4; ++i) { - drflac_int32 tempL0 = pInputSamples0[i*4+0] << shift0; - drflac_int32 tempL1 = pInputSamples0[i*4+1] << shift0; - drflac_int32 tempL2 = pInputSamples0[i*4+2] << shift0; - drflac_int32 tempL3 = pInputSamples0[i*4+3] << shift0; + drflac_uint32 tempL0 = pInputSamples0U32[i*4+0] << shift0; + drflac_uint32 tempL1 = pInputSamples0U32[i*4+1] << shift0; + drflac_uint32 tempL2 = pInputSamples0U32[i*4+2] << shift0; + drflac_uint32 tempL3 = pInputSamples0U32[i*4+3] << shift0; - drflac_int32 tempR0 = pInputSamples1[i*4+0] << shift1; - drflac_int32 tempR1 = pInputSamples1[i*4+1] << shift1; - drflac_int32 tempR2 = pInputSamples1[i*4+2] << shift1; - drflac_int32 tempR3 = pInputSamples1[i*4+3] << shift1; + drflac_uint32 tempR0 = pInputSamples1U32[i*4+0] << shift1; + drflac_uint32 tempR1 = 
pInputSamples1U32[i*4+1] << shift1; + drflac_uint32 tempR2 = pInputSamples1U32[i*4+2] << shift1; + drflac_uint32 tempR3 = pInputSamples1U32[i*4+3] << shift1; tempL0 >>= 16; tempL1 >>= 16; @@ -10153,19 +10172,20 @@ static DRFLAC_INLINE void drflac_read_pcm_frames_s16__decode_independent_stereo_ } for (i = (frameCount4 << 2); i < frameCount; ++i) { - pOutputSamples[i*2+0] = (drflac_int16)((pInputSamples0[i] << shift0) >> 16); - pOutputSamples[i*2+1] = (drflac_int16)((pInputSamples1[i] << shift1) >> 16); + pOutputSamples[i*2+0] = (drflac_int16)((pInputSamples0U32[i] << shift0) >> 16); + pOutputSamples[i*2+1] = (drflac_int16)((pInputSamples1U32[i] << shift1) >> 16); } } #if defined(DRFLAC_SUPPORT_SSE2) -static DRFLAC_INLINE void drflac_read_pcm_frames_s16__decode_independent_stereo__sse2(drflac* pFlac, drflac_uint64 frameCount, drflac_int32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, drflac_int16* pOutputSamples) +static DRFLAC_INLINE void drflac_read_pcm_frames_s16__decode_independent_stereo__sse2(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, drflac_int16* pOutputSamples) { drflac_uint64 i; drflac_uint64 frameCount4 = frameCount >> 2; - - drflac_int32 shift0 = (unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample); - drflac_int32 shift1 = (unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample); + const drflac_uint32* pInputSamples0U32 = (const drflac_uint32*)pInputSamples0; + const drflac_uint32* pInputSamples1U32 = (const drflac_uint32*)pInputSamples1; + drflac_uint32 shift0 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample; + drflac_uint32 shift1 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample; for (i = 0; i < frameCount4; ++i) { __m128i left = _mm_slli_epi32(_mm_loadu_si128((const __m128i*)pInputSamples0 
+ i), shift0); @@ -10179,20 +10199,21 @@ static DRFLAC_INLINE void drflac_read_pcm_frames_s16__decode_independent_stereo_ } for (i = (frameCount4 << 2); i < frameCount; ++i) { - pOutputSamples[i*2+0] = (drflac_int16)((pInputSamples0[i] << shift0) >> 16); - pOutputSamples[i*2+1] = (drflac_int16)((pInputSamples1[i] << shift1) >> 16); + pOutputSamples[i*2+0] = (drflac_int16)((pInputSamples0U32[i] << shift0) >> 16); + pOutputSamples[i*2+1] = (drflac_int16)((pInputSamples1U32[i] << shift1) >> 16); } } #endif #if defined(DRFLAC_SUPPORT_NEON) -static DRFLAC_INLINE void drflac_read_pcm_frames_s16__decode_independent_stereo__neon(drflac* pFlac, drflac_uint64 frameCount, drflac_int32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, drflac_int16* pOutputSamples) +static DRFLAC_INLINE void drflac_read_pcm_frames_s16__decode_independent_stereo__neon(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, drflac_int16* pOutputSamples) { drflac_uint64 i; drflac_uint64 frameCount4 = frameCount >> 2; - - drflac_int32 shift0 = (unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample); - drflac_int32 shift1 = (unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample); + const drflac_uint32* pInputSamples0U32 = (const drflac_uint32*)pInputSamples0; + const drflac_uint32* pInputSamples1U32 = (const drflac_uint32*)pInputSamples1; + drflac_uint32 shift0 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample; + drflac_uint32 shift1 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample; int32x4_t shift0_4 = vdupq_n_s32(shift0); int32x4_t shift1_4 = vdupq_n_s32(shift1); @@ -10201,8 +10222,8 @@ static DRFLAC_INLINE void drflac_read_pcm_frames_s16__decode_independent_stereo_ int32x4_t left; int32x4_t right; - left = vshlq_s32(vld1q_s32(pInputSamples0 + i*4), 
shift0_4); - right = vshlq_s32(vld1q_s32(pInputSamples1 + i*4), shift1_4); + left = vreinterpretq_s32_u32(vshlq_u32(vld1q_u32(pInputSamples0U32 + i*4), shift0_4)); + right = vreinterpretq_s32_u32(vshlq_u32(vld1q_u32(pInputSamples1U32 + i*4), shift1_4)); left = vshrq_n_s32(left, 16); right = vshrq_n_s32(right, 16); @@ -10211,13 +10232,13 @@ static DRFLAC_INLINE void drflac_read_pcm_frames_s16__decode_independent_stereo_ } for (i = (frameCount4 << 2); i < frameCount; ++i) { - pOutputSamples[i*2+0] = (drflac_int16)((pInputSamples0[i] << shift0) >> 16); - pOutputSamples[i*2+1] = (drflac_int16)((pInputSamples1[i] << shift1) >> 16); + pOutputSamples[i*2+0] = (drflac_int16)((pInputSamples0U32[i] << shift0) >> 16); + pOutputSamples[i*2+1] = (drflac_int16)((pInputSamples1U32[i] << shift1) >> 16); } } #endif -static DRFLAC_INLINE void drflac_read_pcm_frames_s16__decode_independent_stereo(drflac* pFlac, drflac_uint64 frameCount, drflac_int32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, drflac_int16* pOutputSamples) +static DRFLAC_INLINE void drflac_read_pcm_frames_s16__decode_independent_stereo(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, drflac_int16* pOutputSamples) { #if defined(DRFLAC_SUPPORT_SSE2) if (drflac__gIsSSE2Supported && pFlac->bitsPerSample <= 24) { @@ -10241,7 +10262,7 @@ static DRFLAC_INLINE void drflac_read_pcm_frames_s16__decode_independent_stereo( DRFLAC_API drflac_uint64 drflac_read_pcm_frames_s16(drflac* pFlac, drflac_uint64 framesToRead, drflac_int16* pBufferOut) { drflac_uint64 framesRead; - drflac_int32 unusedBitsPerSample; + drflac_uint32 unusedBitsPerSample; if (pFlac == NULL || framesToRead == 0) { return 0; @@ -10251,6 +10272,7 @@ DRFLAC_API drflac_uint64 drflac_read_pcm_frames_s16(drflac* pFlac, drflac_uint64 return drflac__seek_forward_by_pcm_frames(pFlac, framesToRead); } + 
DRFLAC_ASSERT(pFlac->bitsPerSample <= 32); unusedBitsPerSample = 32 - pFlac->bitsPerSample; framesRead = 0; @@ -10321,81 +10343,81 @@ DRFLAC_API drflac_uint64 drflac_read_pcm_frames_s16(drflac* pFlac, drflac_uint64 #if 0 -static DRFLAC_INLINE void drflac_read_pcm_frames_f32__decode_left_side__reference(drflac* pFlac, drflac_uint64 frameCount, drflac_int32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, float* pOutputSamples) +static DRFLAC_INLINE void drflac_read_pcm_frames_f32__decode_left_side__reference(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, float* pOutputSamples) { drflac_uint64 i; for (i = 0; i < frameCount; ++i) { - drflac_int32 left = pInputSamples0[i] << (unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample); - drflac_int32 side = pInputSamples1[i] << (unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample); - drflac_int32 right = left - side; + drflac_uint32 left = (drflac_uint32)pInputSamples0[i] << (unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample); + drflac_uint32 side = (drflac_uint32)pInputSamples1[i] << (unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample); + drflac_uint32 right = left - side; - pOutputSamples[i*2+0] = (float)(left / 2147483648.0); - pOutputSamples[i*2+1] = (float)(right / 2147483648.0); + pOutputSamples[i*2+0] = (float)((drflac_int32)left / 2147483648.0); + pOutputSamples[i*2+1] = (float)((drflac_int32)right / 2147483648.0); } } #endif -static DRFLAC_INLINE void drflac_read_pcm_frames_f32__decode_left_side__scalar(drflac* pFlac, drflac_uint64 frameCount, drflac_int32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, float* pOutputSamples) +static DRFLAC_INLINE void drflac_read_pcm_frames_f32__decode_left_side__scalar(drflac* pFlac, 
drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, float* pOutputSamples) { drflac_uint64 i; drflac_uint64 frameCount4 = frameCount >> 2; + const drflac_uint32* pInputSamples0U32 = (const drflac_uint32*)pInputSamples0; + const drflac_uint32* pInputSamples1U32 = (const drflac_uint32*)pInputSamples1; + drflac_uint32 shift0 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample; + drflac_uint32 shift1 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample; float factor = 1 / 2147483648.0; - drflac_int32 shift0 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample; - drflac_int32 shift1 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample; for (i = 0; i < frameCount4; ++i) { - drflac_int32 left0 = pInputSamples0[i*4+0] << shift0; - drflac_int32 left1 = pInputSamples0[i*4+1] << shift0; - drflac_int32 left2 = pInputSamples0[i*4+2] << shift0; - drflac_int32 left3 = pInputSamples0[i*4+3] << shift0; + drflac_uint32 left0 = pInputSamples0U32[i*4+0] << shift0; + drflac_uint32 left1 = pInputSamples0U32[i*4+1] << shift0; + drflac_uint32 left2 = pInputSamples0U32[i*4+2] << shift0; + drflac_uint32 left3 = pInputSamples0U32[i*4+3] << shift0; - drflac_int32 side0 = pInputSamples1[i*4+0] << shift1; - drflac_int32 side1 = pInputSamples1[i*4+1] << shift1; - drflac_int32 side2 = pInputSamples1[i*4+2] << shift1; - drflac_int32 side3 = pInputSamples1[i*4+3] << shift1; + drflac_uint32 side0 = pInputSamples1U32[i*4+0] << shift1; + drflac_uint32 side1 = pInputSamples1U32[i*4+1] << shift1; + drflac_uint32 side2 = pInputSamples1U32[i*4+2] << shift1; + drflac_uint32 side3 = pInputSamples1U32[i*4+3] << shift1; - drflac_int32 right0 = left0 - side0; - drflac_int32 right1 = left1 - side1; - drflac_int32 right2 = left2 - side2; - drflac_int32 right3 = left3 - side3; + drflac_uint32 right0 = left0 - side0; + 
drflac_uint32 right1 = left1 - side1; + drflac_uint32 right2 = left2 - side2; + drflac_uint32 right3 = left3 - side3; - pOutputSamples[i*8+0] = left0 * factor; - pOutputSamples[i*8+1] = right0 * factor; - pOutputSamples[i*8+2] = left1 * factor; - pOutputSamples[i*8+3] = right1 * factor; - pOutputSamples[i*8+4] = left2 * factor; - pOutputSamples[i*8+5] = right2 * factor; - pOutputSamples[i*8+6] = left3 * factor; - pOutputSamples[i*8+7] = right3 * factor; + pOutputSamples[i*8+0] = (drflac_int32)left0 * factor; + pOutputSamples[i*8+1] = (drflac_int32)right0 * factor; + pOutputSamples[i*8+2] = (drflac_int32)left1 * factor; + pOutputSamples[i*8+3] = (drflac_int32)right1 * factor; + pOutputSamples[i*8+4] = (drflac_int32)left2 * factor; + pOutputSamples[i*8+5] = (drflac_int32)right2 * factor; + pOutputSamples[i*8+6] = (drflac_int32)left3 * factor; + pOutputSamples[i*8+7] = (drflac_int32)right3 * factor; } for (i = (frameCount4 << 2); i < frameCount; ++i) { - drflac_int32 left = pInputSamples0[i] << shift0; - drflac_int32 side = pInputSamples1[i] << shift1; - drflac_int32 right = left - side; + drflac_uint32 left = pInputSamples0U32[i] << shift0; + drflac_uint32 side = pInputSamples1U32[i] << shift1; + drflac_uint32 right = left - side; - pOutputSamples[i*2+0] = (float)(left * factor); - pOutputSamples[i*2+1] = (float)(right * factor); + pOutputSamples[i*2+0] = (drflac_int32)left * factor; + pOutputSamples[i*2+1] = (drflac_int32)right * factor; } } #if defined(DRFLAC_SUPPORT_SSE2) -static DRFLAC_INLINE void drflac_read_pcm_frames_f32__decode_left_side__sse2(drflac* pFlac, drflac_uint64 frameCount, drflac_int32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, float* pOutputSamples) +static DRFLAC_INLINE void drflac_read_pcm_frames_f32__decode_left_side__sse2(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, float* pOutputSamples) { - 
drflac_uint64 frameCount4; - drflac_int32 shift0; - drflac_int32 shift1; drflac_uint64 i; + drflac_uint64 frameCount4 = frameCount >> 2; + const drflac_uint32* pInputSamples0U32 = (const drflac_uint32*)pInputSamples0; + const drflac_uint32* pInputSamples1U32 = (const drflac_uint32*)pInputSamples1; + drflac_uint32 shift0 = (unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample) - 8; + drflac_uint32 shift1 = (unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample) - 8; __m128 factor; DRFLAC_ASSERT(pFlac->bitsPerSample <= 24); - frameCount4 = frameCount >> 2; - factor = _mm_set1_ps(1.0f / 8388608.0f); - shift0 = (unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample) - 8; - shift1 = (unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample) - 8; for (i = 0; i < frameCount4; ++i) { __m128i left = _mm_slli_epi32(_mm_loadu_si128((const __m128i*)pInputSamples0 + i), shift0); @@ -10409,67 +10431,63 @@ static DRFLAC_INLINE void drflac_read_pcm_frames_f32__decode_left_side__sse2(drf } for (i = (frameCount4 << 2); i < frameCount; ++i) { - drflac_int32 left = pInputSamples0[i] << shift0; - drflac_int32 side = pInputSamples1[i] << shift1; - drflac_int32 right = left - side; + drflac_uint32 left = pInputSamples0U32[i] << shift0; + drflac_uint32 side = pInputSamples1U32[i] << shift1; + drflac_uint32 right = left - side; - pOutputSamples[i*2+0] = (float)(left / 8388608.0f); - pOutputSamples[i*2+1] = (float)(right / 8388608.0f); + pOutputSamples[i*2+0] = (drflac_int32)left / 8388608.0f; + pOutputSamples[i*2+1] = (drflac_int32)right / 8388608.0f; } } #endif #if defined(DRFLAC_SUPPORT_NEON) -static DRFLAC_INLINE void drflac_read_pcm_frames_f32__decode_left_side__neon(drflac* pFlac, drflac_uint64 frameCount, drflac_int32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, float* pOutputSamples) +static DRFLAC_INLINE void 
drflac_read_pcm_frames_f32__decode_left_side__neon(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, float* pOutputSamples) { - drflac_uint64 frameCount4; - drflac_int32 shift0; - drflac_int32 shift1; drflac_uint64 i; + drflac_uint64 frameCount4 = frameCount >> 2; + const drflac_uint32* pInputSamples0U32 = (const drflac_uint32*)pInputSamples0; + const drflac_uint32* pInputSamples1U32 = (const drflac_uint32*)pInputSamples1; + drflac_uint32 shift0 = (unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample) - 8; + drflac_uint32 shift1 = (unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample) - 8; float32x4_t factor4; int32x4_t shift0_4; int32x4_t shift1_4; DRFLAC_ASSERT(pFlac->bitsPerSample <= 24); - frameCount4 = frameCount >> 2; - - factor4 = vdupq_n_f32(1.0f / 8388608.0f); - - shift0 = (unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample) - 8; - shift1 = (unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample) - 8; - + factor4 = vdupq_n_f32(1.0f / 8388608.0f); shift0_4 = vdupq_n_s32(shift0); shift1_4 = vdupq_n_s32(shift1); for (i = 0; i < frameCount4; ++i) { - int32x4_t left; - int32x4_t side; - int32x4_t right; + uint32x4_t left; + uint32x4_t side; + uint32x4_t right; float32x4_t leftf; float32x4_t rightf; - left = vshlq_s32(vld1q_s32(pInputSamples0 + i*4), shift0_4); - side = vshlq_s32(vld1q_s32(pInputSamples1 + i*4), shift1_4); - right = vsubq_s32(left, side); - leftf = vmulq_f32(vcvtq_f32_s32(left), factor4); - rightf = vmulq_f32(vcvtq_f32_s32(right), factor4); + left = vshlq_u32(vld1q_u32(pInputSamples0U32 + i*4), shift0_4); + side = vshlq_u32(vld1q_u32(pInputSamples1U32 + i*4), shift1_4); + right = vsubq_u32(left, side); + leftf = vmulq_f32(vcvtq_f32_s32(vreinterpretq_s32_u32(left)), factor4); + rightf = vmulq_f32(vcvtq_f32_s32(vreinterpretq_s32_u32(right)), 
factor4); drflac__vst2q_f32(pOutputSamples + i*8, vzipq_f32(leftf, rightf)); } for (i = (frameCount4 << 2); i < frameCount; ++i) { - drflac_int32 left = pInputSamples0[i] << shift0; - drflac_int32 side = pInputSamples1[i] << shift1; - drflac_int32 right = left - side; + drflac_uint32 left = pInputSamples0U32[i] << shift0; + drflac_uint32 side = pInputSamples1U32[i] << shift1; + drflac_uint32 right = left - side; - pOutputSamples[i*2+0] = (float)(left / 8388608.0f); - pOutputSamples[i*2+1] = (float)(right / 8388608.0f); + pOutputSamples[i*2+0] = (drflac_int32)left / 8388608.0f; + pOutputSamples[i*2+1] = (drflac_int32)right / 8388608.0f; } } #endif -static DRFLAC_INLINE void drflac_read_pcm_frames_f32__decode_left_side(drflac* pFlac, drflac_uint64 frameCount, drflac_int32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, float* pOutputSamples) +static DRFLAC_INLINE void drflac_read_pcm_frames_f32__decode_left_side(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, float* pOutputSamples) { #if defined(DRFLAC_SUPPORT_SSE2) if (drflac__gIsSSE2Supported && pFlac->bitsPerSample <= 24) { @@ -10492,81 +10510,80 @@ static DRFLAC_INLINE void drflac_read_pcm_frames_f32__decode_left_side(drflac* p #if 0 -static DRFLAC_INLINE void drflac_read_pcm_frames_f32__decode_right_side__reference(drflac* pFlac, drflac_uint64 frameCount, drflac_int32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, float* pOutputSamples) +static DRFLAC_INLINE void drflac_read_pcm_frames_f32__decode_right_side__reference(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, float* pOutputSamples) { drflac_uint64 i; for (i = 0; i < frameCount; ++i) { - drflac_int32 side = pInputSamples0[i] << (unusedBitsPerSample + 
pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample); - drflac_int32 right = pInputSamples1[i] << (unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample); - drflac_int32 left = right + side; + drflac_uint32 side = (drflac_uint32)pInputSamples0[i] << (unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample); + drflac_uint32 right = (drflac_uint32)pInputSamples1[i] << (unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample); + drflac_uint32 left = right + side; - pOutputSamples[i*2+0] = (float)(left / 2147483648.0); - pOutputSamples[i*2+1] = (float)(right / 2147483648.0); + pOutputSamples[i*2+0] = (float)((drflac_int32)left / 2147483648.0); + pOutputSamples[i*2+1] = (float)((drflac_int32)right / 2147483648.0); } } #endif -static DRFLAC_INLINE void drflac_read_pcm_frames_f32__decode_right_side__scalar(drflac* pFlac, drflac_uint64 frameCount, drflac_int32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, float* pOutputSamples) +static DRFLAC_INLINE void drflac_read_pcm_frames_f32__decode_right_side__scalar(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, float* pOutputSamples) { drflac_uint64 i; drflac_uint64 frameCount4 = frameCount >> 2; - + const drflac_uint32* pInputSamples0U32 = (const drflac_uint32*)pInputSamples0; + const drflac_uint32* pInputSamples1U32 = (const drflac_uint32*)pInputSamples1; + drflac_uint32 shift0 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample; + drflac_uint32 shift1 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample; float factor = 1 / 2147483648.0; - drflac_int32 shift0 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample; - drflac_int32 shift1 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample; for (i = 0; 
i < frameCount4; ++i) { - drflac_int32 side0 = pInputSamples0[i*4+0] << shift0; - drflac_int32 side1 = pInputSamples0[i*4+1] << shift0; - drflac_int32 side2 = pInputSamples0[i*4+2] << shift0; - drflac_int32 side3 = pInputSamples0[i*4+3] << shift0; + drflac_uint32 side0 = pInputSamples0U32[i*4+0] << shift0; + drflac_uint32 side1 = pInputSamples0U32[i*4+1] << shift0; + drflac_uint32 side2 = pInputSamples0U32[i*4+2] << shift0; + drflac_uint32 side3 = pInputSamples0U32[i*4+3] << shift0; - drflac_int32 right0 = pInputSamples1[i*4+0] << shift1; - drflac_int32 right1 = pInputSamples1[i*4+1] << shift1; - drflac_int32 right2 = pInputSamples1[i*4+2] << shift1; - drflac_int32 right3 = pInputSamples1[i*4+3] << shift1; + drflac_uint32 right0 = pInputSamples1U32[i*4+0] << shift1; + drflac_uint32 right1 = pInputSamples1U32[i*4+1] << shift1; + drflac_uint32 right2 = pInputSamples1U32[i*4+2] << shift1; + drflac_uint32 right3 = pInputSamples1U32[i*4+3] << shift1; - drflac_int32 left0 = right0 + side0; - drflac_int32 left1 = right1 + side1; - drflac_int32 left2 = right2 + side2; - drflac_int32 left3 = right3 + side3; + drflac_uint32 left0 = right0 + side0; + drflac_uint32 left1 = right1 + side1; + drflac_uint32 left2 = right2 + side2; + drflac_uint32 left3 = right3 + side3; - pOutputSamples[i*8+0] = left0 * factor; - pOutputSamples[i*8+1] = right0 * factor; - pOutputSamples[i*8+2] = left1 * factor; - pOutputSamples[i*8+3] = right1 * factor; - pOutputSamples[i*8+4] = left2 * factor; - pOutputSamples[i*8+5] = right2 * factor; - pOutputSamples[i*8+6] = left3 * factor; - pOutputSamples[i*8+7] = right3 * factor; + pOutputSamples[i*8+0] = (drflac_int32)left0 * factor; + pOutputSamples[i*8+1] = (drflac_int32)right0 * factor; + pOutputSamples[i*8+2] = (drflac_int32)left1 * factor; + pOutputSamples[i*8+3] = (drflac_int32)right1 * factor; + pOutputSamples[i*8+4] = (drflac_int32)left2 * factor; + pOutputSamples[i*8+5] = (drflac_int32)right2 * factor; + pOutputSamples[i*8+6] = 
(drflac_int32)left3 * factor; + pOutputSamples[i*8+7] = (drflac_int32)right3 * factor; } for (i = (frameCount4 << 2); i < frameCount; ++i) { - drflac_int32 side = pInputSamples0[i] << shift0; - drflac_int32 right = pInputSamples1[i] << shift1; - drflac_int32 left = right + side; + drflac_uint32 side = pInputSamples0U32[i] << shift0; + drflac_uint32 right = pInputSamples1U32[i] << shift1; + drflac_uint32 left = right + side; - pOutputSamples[i*2+0] = (float)(left * factor); - pOutputSamples[i*2+1] = (float)(right * factor); + pOutputSamples[i*2+0] = (drflac_int32)left * factor; + pOutputSamples[i*2+1] = (drflac_int32)right * factor; } } #if defined(DRFLAC_SUPPORT_SSE2) -static DRFLAC_INLINE void drflac_read_pcm_frames_f32__decode_right_side__sse2(drflac* pFlac, drflac_uint64 frameCount, drflac_int32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, float* pOutputSamples) +static DRFLAC_INLINE void drflac_read_pcm_frames_f32__decode_right_side__sse2(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, float* pOutputSamples) { - drflac_uint64 frameCount4; - drflac_int32 shift0; - drflac_int32 shift1; drflac_uint64 i; + drflac_uint64 frameCount4 = frameCount >> 2; + const drflac_uint32* pInputSamples0U32 = (const drflac_uint32*)pInputSamples0; + const drflac_uint32* pInputSamples1U32 = (const drflac_uint32*)pInputSamples1; + drflac_uint32 shift0 = (unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample) - 8; + drflac_uint32 shift1 = (unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample) - 8; __m128 factor; DRFLAC_ASSERT(pFlac->bitsPerSample <= 24); - frameCount4 = frameCount >> 2; - factor = _mm_set1_ps(1.0f / 8388608.0f); - shift0 = (unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample) - 8; - shift1 = (unusedBitsPerSample + 
pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample) - 8; for (i = 0; i < frameCount4; ++i) { __m128i side = _mm_slli_epi32(_mm_loadu_si128((const __m128i*)pInputSamples0 + i), shift0); @@ -10580,67 +10597,63 @@ static DRFLAC_INLINE void drflac_read_pcm_frames_f32__decode_right_side__sse2(dr } for (i = (frameCount4 << 2); i < frameCount; ++i) { - drflac_int32 side = pInputSamples0[i] << shift0; - drflac_int32 right = pInputSamples1[i] << shift1; - drflac_int32 left = right + side; + drflac_uint32 side = pInputSamples0U32[i] << shift0; + drflac_uint32 right = pInputSamples1U32[i] << shift1; + drflac_uint32 left = right + side; - pOutputSamples[i*2+0] = (float)(left / 8388608.0f); - pOutputSamples[i*2+1] = (float)(right / 8388608.0f); + pOutputSamples[i*2+0] = (drflac_int32)left / 8388608.0f; + pOutputSamples[i*2+1] = (drflac_int32)right / 8388608.0f; } } #endif #if defined(DRFLAC_SUPPORT_NEON) -static DRFLAC_INLINE void drflac_read_pcm_frames_f32__decode_right_side__neon(drflac* pFlac, drflac_uint64 frameCount, drflac_int32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, float* pOutputSamples) +static DRFLAC_INLINE void drflac_read_pcm_frames_f32__decode_right_side__neon(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, float* pOutputSamples) { - drflac_uint64 frameCount4; - drflac_int32 shift0; - drflac_int32 shift1; drflac_uint64 i; + drflac_uint64 frameCount4 = frameCount >> 2; + const drflac_uint32* pInputSamples0U32 = (const drflac_uint32*)pInputSamples0; + const drflac_uint32* pInputSamples1U32 = (const drflac_uint32*)pInputSamples1; + drflac_uint32 shift0 = (unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample) - 8; + drflac_uint32 shift1 = (unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample) - 8; float32x4_t factor4; int32x4_t shift0_4; int32x4_t shift1_4; 
DRFLAC_ASSERT(pFlac->bitsPerSample <= 24); - frameCount4 = frameCount >> 2; - - factor4 = vdupq_n_f32(1.0f / 8388608.0f); - - shift0 = (unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample) - 8; - shift1 = (unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample) - 8; - + factor4 = vdupq_n_f32(1.0f / 8388608.0f); shift0_4 = vdupq_n_s32(shift0); shift1_4 = vdupq_n_s32(shift1); for (i = 0; i < frameCount4; ++i) { - int32x4_t side; - int32x4_t right; - int32x4_t left; + uint32x4_t side; + uint32x4_t right; + uint32x4_t left; float32x4_t leftf; float32x4_t rightf; - side = vshlq_s32(vld1q_s32(pInputSamples0 + i*4), shift0_4); - right = vshlq_s32(vld1q_s32(pInputSamples1 + i*4), shift1_4); - left = vaddq_s32(right, side); - leftf = vmulq_f32(vcvtq_f32_s32(left), factor4); - rightf = vmulq_f32(vcvtq_f32_s32(right), factor4); + side = vshlq_u32(vld1q_u32(pInputSamples0U32 + i*4), shift0_4); + right = vshlq_u32(vld1q_u32(pInputSamples1U32 + i*4), shift1_4); + left = vaddq_u32(right, side); + leftf = vmulq_f32(vcvtq_f32_s32(vreinterpretq_s32_u32(left)), factor4); + rightf = vmulq_f32(vcvtq_f32_s32(vreinterpretq_s32_u32(right)), factor4); drflac__vst2q_f32(pOutputSamples + i*8, vzipq_f32(leftf, rightf)); } for (i = (frameCount4 << 2); i < frameCount; ++i) { - drflac_int32 side = pInputSamples0[i] << shift0; - drflac_int32 right = pInputSamples1[i] << shift1; - drflac_int32 left = right + side; + drflac_uint32 side = pInputSamples0U32[i] << shift0; + drflac_uint32 right = pInputSamples1U32[i] << shift1; + drflac_uint32 left = right + side; - pOutputSamples[i*2+0] = (float)(left / 8388608.0f); - pOutputSamples[i*2+1] = (float)(right / 8388608.0f); + pOutputSamples[i*2+0] = (drflac_int32)left / 8388608.0f; + pOutputSamples[i*2+1] = (drflac_int32)right / 8388608.0f; } } #endif -static DRFLAC_INLINE void drflac_read_pcm_frames_f32__decode_right_side(drflac* pFlac, drflac_uint64 frameCount, drflac_int32 unusedBitsPerSample, const 
drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, float* pOutputSamples) +static DRFLAC_INLINE void drflac_read_pcm_frames_f32__decode_right_side(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, float* pOutputSamples) { #if defined(DRFLAC_SUPPORT_SSE2) if (drflac__gIsSSE2Supported && pFlac->bitsPerSample <= 24) { @@ -10663,149 +10676,149 @@ static DRFLAC_INLINE void drflac_read_pcm_frames_f32__decode_right_side(drflac* #if 0 -static DRFLAC_INLINE void drflac_read_pcm_frames_f32__decode_mid_side__reference(drflac* pFlac, drflac_uint64 frameCount, drflac_int32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, float* pOutputSamples) +static DRFLAC_INLINE void drflac_read_pcm_frames_f32__decode_mid_side__reference(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, float* pOutputSamples) { for (drflac_uint64 i = 0; i < frameCount; ++i) { - drflac_int32 mid = pInputSamples0[i] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample; - drflac_int32 side = pInputSamples1[i] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample; + drflac_uint32 mid = (drflac_uint32)pInputSamples0[i] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample; + drflac_uint32 side = (drflac_uint32)pInputSamples1[i] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample; - mid = (((drflac_uint32)mid) << 1) | (side & 0x01); + mid = (mid << 1) | (side & 0x01); - pOutputSamples[i*2+0] = (float)((((mid + side) >> 1) << (unusedBitsPerSample)) / 2147483648.0); - pOutputSamples[i*2+1] = (float)((((mid - side) >> 1) << (unusedBitsPerSample)) / 2147483648.0); + pOutputSamples[i*2+0] = (float)((((drflac_int32)(mid + side) >> 1) << (unusedBitsPerSample)) / 2147483648.0); + pOutputSamples[i*2+1] = (float)((((drflac_int32)(mid - side) >> 1) 
<< (unusedBitsPerSample)) / 2147483648.0); } } #endif -static DRFLAC_INLINE void drflac_read_pcm_frames_f32__decode_mid_side__scalar(drflac* pFlac, drflac_uint64 frameCount, drflac_int32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, float* pOutputSamples) +static DRFLAC_INLINE void drflac_read_pcm_frames_f32__decode_mid_side__scalar(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, float* pOutputSamples) { drflac_uint64 i; drflac_uint64 frameCount4 = frameCount >> 2; - + const drflac_uint32* pInputSamples0U32 = (const drflac_uint32*)pInputSamples0; + const drflac_uint32* pInputSamples1U32 = (const drflac_uint32*)pInputSamples1; + drflac_uint32 shift = unusedBitsPerSample; float factor = 1 / 2147483648.0; - int shift = unusedBitsPerSample; if (shift > 0) { shift -= 1; for (i = 0; i < frameCount4; ++i) { - drflac_int32 temp0L; - drflac_int32 temp1L; - drflac_int32 temp2L; - drflac_int32 temp3L; - drflac_int32 temp0R; - drflac_int32 temp1R; - drflac_int32 temp2R; - drflac_int32 temp3R; + drflac_uint32 temp0L; + drflac_uint32 temp1L; + drflac_uint32 temp2L; + drflac_uint32 temp3L; + drflac_uint32 temp0R; + drflac_uint32 temp1R; + drflac_uint32 temp2R; + drflac_uint32 temp3R; - drflac_int32 mid0 = pInputSamples0[i*4+0] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample; - drflac_int32 mid1 = pInputSamples0[i*4+1] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample; - drflac_int32 mid2 = pInputSamples0[i*4+2] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample; - drflac_int32 mid3 = pInputSamples0[i*4+3] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample; + drflac_uint32 mid0 = pInputSamples0U32[i*4+0] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample; + drflac_uint32 mid1 = pInputSamples0U32[i*4+1] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample; + drflac_uint32 mid2 = 
pInputSamples0U32[i*4+2] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample; + drflac_uint32 mid3 = pInputSamples0U32[i*4+3] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample; - drflac_int32 side0 = pInputSamples1[i*4+0] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample; - drflac_int32 side1 = pInputSamples1[i*4+1] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample; - drflac_int32 side2 = pInputSamples1[i*4+2] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample; - drflac_int32 side3 = pInputSamples1[i*4+3] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample; + drflac_uint32 side0 = pInputSamples1U32[i*4+0] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample; + drflac_uint32 side1 = pInputSamples1U32[i*4+1] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample; + drflac_uint32 side2 = pInputSamples1U32[i*4+2] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample; + drflac_uint32 side3 = pInputSamples1U32[i*4+3] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample; - mid0 = (((drflac_uint32)mid0) << 1) | (side0 & 0x01); - mid1 = (((drflac_uint32)mid1) << 1) | (side1 & 0x01); - mid2 = (((drflac_uint32)mid2) << 1) | (side2 & 0x01); - mid3 = (((drflac_uint32)mid3) << 1) | (side3 & 0x01); + mid0 = (mid0 << 1) | (side0 & 0x01); + mid1 = (mid1 << 1) | (side1 & 0x01); + mid2 = (mid2 << 1) | (side2 & 0x01); + mid3 = (mid3 << 1) | (side3 & 0x01); - temp0L = ((mid0 + side0) << shift); - temp1L = ((mid1 + side1) << shift); - temp2L = ((mid2 + side2) << shift); - temp3L = ((mid3 + side3) << shift); + temp0L = (mid0 + side0) << shift; + temp1L = (mid1 + side1) << shift; + temp2L = (mid2 + side2) << shift; + temp3L = (mid3 + side3) << shift; - temp0R = ((mid0 - side0) << shift); - temp1R = ((mid1 - side1) << shift); - temp2R = ((mid2 - side2) << shift); - temp3R = ((mid3 - side3) << shift); + temp0R = (mid0 - side0) << shift; + temp1R = (mid1 - side1) << shift; + temp2R = (mid2 - side2) << shift; + 
temp3R = (mid3 - side3) << shift; - pOutputSamples[i*8+0] = (float)(temp0L * factor); - pOutputSamples[i*8+1] = (float)(temp0R * factor); - pOutputSamples[i*8+2] = (float)(temp1L * factor); - pOutputSamples[i*8+3] = (float)(temp1R * factor); - pOutputSamples[i*8+4] = (float)(temp2L * factor); - pOutputSamples[i*8+5] = (float)(temp2R * factor); - pOutputSamples[i*8+6] = (float)(temp3L * factor); - pOutputSamples[i*8+7] = (float)(temp3R * factor); + pOutputSamples[i*8+0] = (drflac_int32)temp0L * factor; + pOutputSamples[i*8+1] = (drflac_int32)temp0R * factor; + pOutputSamples[i*8+2] = (drflac_int32)temp1L * factor; + pOutputSamples[i*8+3] = (drflac_int32)temp1R * factor; + pOutputSamples[i*8+4] = (drflac_int32)temp2L * factor; + pOutputSamples[i*8+5] = (drflac_int32)temp2R * factor; + pOutputSamples[i*8+6] = (drflac_int32)temp3L * factor; + pOutputSamples[i*8+7] = (drflac_int32)temp3R * factor; } } else { for (i = 0; i < frameCount4; ++i) { - drflac_int32 temp0L; - drflac_int32 temp1L; - drflac_int32 temp2L; - drflac_int32 temp3L; - drflac_int32 temp0R; - drflac_int32 temp1R; - drflac_int32 temp2R; - drflac_int32 temp3R; + drflac_uint32 temp0L; + drflac_uint32 temp1L; + drflac_uint32 temp2L; + drflac_uint32 temp3L; + drflac_uint32 temp0R; + drflac_uint32 temp1R; + drflac_uint32 temp2R; + drflac_uint32 temp3R; - drflac_int32 mid0 = pInputSamples0[i*4+0] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample; - drflac_int32 mid1 = pInputSamples0[i*4+1] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample; - drflac_int32 mid2 = pInputSamples0[i*4+2] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample; - drflac_int32 mid3 = pInputSamples0[i*4+3] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample; + drflac_uint32 mid0 = pInputSamples0U32[i*4+0] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample; + drflac_uint32 mid1 = pInputSamples0U32[i*4+1] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample; + drflac_uint32 mid2 = 
pInputSamples0U32[i*4+2] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample; + drflac_uint32 mid3 = pInputSamples0U32[i*4+3] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample; - drflac_int32 side0 = pInputSamples1[i*4+0] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample; - drflac_int32 side1 = pInputSamples1[i*4+1] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample; - drflac_int32 side2 = pInputSamples1[i*4+2] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample; - drflac_int32 side3 = pInputSamples1[i*4+3] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample; + drflac_uint32 side0 = pInputSamples1U32[i*4+0] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample; + drflac_uint32 side1 = pInputSamples1U32[i*4+1] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample; + drflac_uint32 side2 = pInputSamples1U32[i*4+2] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample; + drflac_uint32 side3 = pInputSamples1U32[i*4+3] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample; - mid0 = (((drflac_uint32)mid0) << 1) | (side0 & 0x01); - mid1 = (((drflac_uint32)mid1) << 1) | (side1 & 0x01); - mid2 = (((drflac_uint32)mid2) << 1) | (side2 & 0x01); - mid3 = (((drflac_uint32)mid3) << 1) | (side3 & 0x01); + mid0 = (mid0 << 1) | (side0 & 0x01); + mid1 = (mid1 << 1) | (side1 & 0x01); + mid2 = (mid2 << 1) | (side2 & 0x01); + mid3 = (mid3 << 1) | (side3 & 0x01); - temp0L = ((mid0 + side0) >> 1); - temp1L = ((mid1 + side1) >> 1); - temp2L = ((mid2 + side2) >> 1); - temp3L = ((mid3 + side3) >> 1); + temp0L = (drflac_uint32)((drflac_int32)(mid0 + side0) >> 1); + temp1L = (drflac_uint32)((drflac_int32)(mid1 + side1) >> 1); + temp2L = (drflac_uint32)((drflac_int32)(mid2 + side2) >> 1); + temp3L = (drflac_uint32)((drflac_int32)(mid3 + side3) >> 1); - temp0R = ((mid0 - side0) >> 1); - temp1R = ((mid1 - side1) >> 1); - temp2R = ((mid2 - side2) >> 1); - temp3R = ((mid3 - side3) >> 1); + temp0R = 
(drflac_uint32)((drflac_int32)(mid0 - side0) >> 1); + temp1R = (drflac_uint32)((drflac_int32)(mid1 - side1) >> 1); + temp2R = (drflac_uint32)((drflac_int32)(mid2 - side2) >> 1); + temp3R = (drflac_uint32)((drflac_int32)(mid3 - side3) >> 1); - pOutputSamples[i*8+0] = (float)(temp0L * factor); - pOutputSamples[i*8+1] = (float)(temp0R * factor); - pOutputSamples[i*8+2] = (float)(temp1L * factor); - pOutputSamples[i*8+3] = (float)(temp1R * factor); - pOutputSamples[i*8+4] = (float)(temp2L * factor); - pOutputSamples[i*8+5] = (float)(temp2R * factor); - pOutputSamples[i*8+6] = (float)(temp3L * factor); - pOutputSamples[i*8+7] = (float)(temp3R * factor); + pOutputSamples[i*8+0] = (drflac_int32)temp0L * factor; + pOutputSamples[i*8+1] = (drflac_int32)temp0R * factor; + pOutputSamples[i*8+2] = (drflac_int32)temp1L * factor; + pOutputSamples[i*8+3] = (drflac_int32)temp1R * factor; + pOutputSamples[i*8+4] = (drflac_int32)temp2L * factor; + pOutputSamples[i*8+5] = (drflac_int32)temp2R * factor; + pOutputSamples[i*8+6] = (drflac_int32)temp3L * factor; + pOutputSamples[i*8+7] = (drflac_int32)temp3R * factor; } } for (i = (frameCount4 << 2); i < frameCount; ++i) { - int mid = pInputSamples0[i] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample; - int side = pInputSamples1[i] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample; + drflac_uint32 mid = pInputSamples0U32[i] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample; + drflac_uint32 side = pInputSamples1U32[i] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample; - mid = (((drflac_uint32)mid) << 1) | (side & 0x01); + mid = (mid << 1) | (side & 0x01); - pOutputSamples[i*2+0] = (float)((((mid + side) >> 1) << unusedBitsPerSample) * factor); - pOutputSamples[i*2+1] = (float)((((mid - side) >> 1) << unusedBitsPerSample) * factor); + pOutputSamples[i*2+0] = (drflac_int32)((drflac_uint32)((drflac_int32)(mid + side) >> 1) << unusedBitsPerSample) * factor; + pOutputSamples[i*2+1] = 
(drflac_int32)((drflac_uint32)((drflac_int32)(mid - side) >> 1) << unusedBitsPerSample) * factor; } } #if defined(DRFLAC_SUPPORT_SSE2) -static DRFLAC_INLINE void drflac_read_pcm_frames_f32__decode_mid_side__sse2(drflac* pFlac, drflac_uint64 frameCount, drflac_int32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, float* pOutputSamples) +static DRFLAC_INLINE void drflac_read_pcm_frames_f32__decode_mid_side__sse2(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, float* pOutputSamples) { drflac_uint64 i; - drflac_uint64 frameCount4; + drflac_uint64 frameCount4 = frameCount >> 2; + const drflac_uint32* pInputSamples0U32 = (const drflac_uint32*)pInputSamples0; + const drflac_uint32* pInputSamples1U32 = (const drflac_uint32*)pInputSamples1; + drflac_uint32 shift = unusedBitsPerSample - 8; float factor; - drflac_int32 shift; __m128 factor128; DRFLAC_ASSERT(pFlac->bitsPerSample <= 24); - frameCount4 = frameCount >> 2; - factor = 1.0f / 8388608.0f; - factor128 = _mm_set1_ps(1.0f / 8388608.0f); + factor128 = _mm_set1_ps(factor); - shift = unusedBitsPerSample - 8; if (shift == 0) { for (i = 0; i < frameCount4; ++i) { __m128i mid; @@ -10831,13 +10844,13 @@ static DRFLAC_INLINE void drflac_read_pcm_frames_f32__decode_mid_side__sse2(drfl } for (i = (frameCount4 << 2); i < frameCount; ++i) { - drflac_int32 mid = pInputSamples0[i] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample; - drflac_int32 side = pInputSamples1[i] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample; + drflac_uint32 mid = pInputSamples0U32[i] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample; + drflac_uint32 side = pInputSamples1U32[i] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample; - mid = (((drflac_uint32)mid) << 1) | (side & 0x01); + mid = (mid << 1) | (side & 0x01); - pOutputSamples[i*2+0] = (float)(((mid + side) >> 1) * 
factor); - pOutputSamples[i*2+1] = (float)(((mid - side) >> 1) * factor); + pOutputSamples[i*2+0] = ((drflac_int32)(mid + side) >> 1) * factor; + pOutputSamples[i*2+1] = ((drflac_int32)(mid - side) >> 1) * factor; } } else { shift -= 1; @@ -10865,25 +10878,27 @@ static DRFLAC_INLINE void drflac_read_pcm_frames_f32__decode_mid_side__sse2(drfl } for (i = (frameCount4 << 2); i < frameCount; ++i) { - drflac_int32 mid = pInputSamples0[i] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample; - drflac_int32 side = pInputSamples1[i] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample; + drflac_uint32 mid = pInputSamples0U32[i] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample; + drflac_uint32 side = pInputSamples1U32[i] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample; - mid = (((drflac_uint32)mid) << 1) | (side & 0x01); + mid = (mid << 1) | (side & 0x01); - pOutputSamples[i*2+0] = (float)(((mid + side) << shift) * factor); - pOutputSamples[i*2+1] = (float)(((mid - side) << shift) * factor); + pOutputSamples[i*2+0] = (drflac_int32)((mid + side) << shift) * factor; + pOutputSamples[i*2+1] = (drflac_int32)((mid - side) << shift) * factor; } } } #endif #if defined(DRFLAC_SUPPORT_NEON) -static DRFLAC_INLINE void drflac_read_pcm_frames_f32__decode_mid_side__neon(drflac* pFlac, drflac_uint64 frameCount, drflac_int32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, float* pOutputSamples) +static DRFLAC_INLINE void drflac_read_pcm_frames_f32__decode_mid_side__neon(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, float* pOutputSamples) { drflac_uint64 i; - drflac_uint64 frameCount4; + drflac_uint64 frameCount4 = frameCount >> 2; + const drflac_uint32* pInputSamples0U32 = (const drflac_uint32*)pInputSamples0; + const drflac_uint32* pInputSamples1U32 = (const drflac_uint32*)pInputSamples1; + drflac_uint32 
shift = unusedBitsPerSample - 8; float factor; - drflac_int32 shift; float32x4_t factor4; int32x4_t shift4; int32x4_t wbps0_4; /* Wasted Bits Per Sample */ @@ -10891,15 +10906,11 @@ static DRFLAC_INLINE void drflac_read_pcm_frames_f32__decode_mid_side__neon(drfl DRFLAC_ASSERT(pFlac->bitsPerSample <= 24); - frameCount4 = frameCount >> 2; - - factor = 1.0f / 8388608.0f; + factor = 1.0f / 8388608.0f; factor4 = vdupq_n_f32(factor); - wbps0_4 = vdupq_n_s32(pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample); wbps1_4 = vdupq_n_s32(pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample); - shift = unusedBitsPerSample - 8; if (shift == 0) { for (i = 0; i < frameCount4; ++i) { int32x4_t lefti; @@ -10907,13 +10918,13 @@ static DRFLAC_INLINE void drflac_read_pcm_frames_f32__decode_mid_side__neon(drfl float32x4_t leftf; float32x4_t rightf; - int32x4_t mid = vshlq_s32(vld1q_s32(pInputSamples0 + i*4), wbps0_4); - int32x4_t side = vshlq_s32(vld1q_s32(pInputSamples1 + i*4), wbps1_4); + uint32x4_t mid = vshlq_u32(vld1q_u32(pInputSamples0U32 + i*4), wbps0_4); + uint32x4_t side = vshlq_u32(vld1q_u32(pInputSamples1U32 + i*4), wbps1_4); - mid = vorrq_s32(vshlq_n_s32(mid, 1), vandq_s32(side, vdupq_n_s32(1))); + mid = vorrq_u32(vshlq_n_u32(mid, 1), vandq_u32(side, vdupq_n_u32(1))); - lefti = vshrq_n_s32(vaddq_s32(mid, side), 1); - righti = vshrq_n_s32(vsubq_s32(mid, side), 1); + lefti = vshrq_n_s32(vreinterpretq_s32_u32(vaddq_u32(mid, side)), 1); + righti = vshrq_n_s32(vreinterpretq_s32_u32(vsubq_u32(mid, side)), 1); leftf = vmulq_f32(vcvtq_f32_s32(lefti), factor4); rightf = vmulq_f32(vcvtq_f32_s32(righti), factor4); @@ -10922,32 +10933,32 @@ static DRFLAC_INLINE void drflac_read_pcm_frames_f32__decode_mid_side__neon(drfl } for (i = (frameCount4 << 2); i < frameCount; ++i) { - drflac_int32 mid = pInputSamples0[i] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample; - drflac_int32 side = pInputSamples1[i] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample; + 
drflac_uint32 mid = pInputSamples0U32[i] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample; + drflac_uint32 side = pInputSamples1U32[i] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample; - mid = (((drflac_uint32)mid) << 1) | (side & 0x01); + mid = (mid << 1) | (side & 0x01); - pOutputSamples[i*2+0] = (float)(((mid + side) >> 1) * factor); - pOutputSamples[i*2+1] = (float)(((mid - side) >> 1) * factor); + pOutputSamples[i*2+0] = ((drflac_int32)(mid + side) >> 1) * factor; + pOutputSamples[i*2+1] = ((drflac_int32)(mid - side) >> 1) * factor; } } else { shift -= 1; shift4 = vdupq_n_s32(shift); for (i = 0; i < frameCount4; ++i) { - int32x4_t mid; - int32x4_t side; + uint32x4_t mid; + uint32x4_t side; int32x4_t lefti; int32x4_t righti; float32x4_t leftf; float32x4_t rightf; - mid = vshlq_s32(vld1q_s32(pInputSamples0 + i*4), wbps0_4); - side = vshlq_s32(vld1q_s32(pInputSamples1 + i*4), wbps1_4); + mid = vshlq_u32(vld1q_u32(pInputSamples0U32 + i*4), wbps0_4); + side = vshlq_u32(vld1q_u32(pInputSamples1U32 + i*4), wbps1_4); - mid = vorrq_s32(vshlq_n_s32(mid, 1), vandq_s32(side, vdupq_n_s32(1))); + mid = vorrq_u32(vshlq_n_u32(mid, 1), vandq_u32(side, vdupq_n_u32(1))); - lefti = vshlq_s32(vaddq_s32(mid, side), shift4); - righti = vshlq_s32(vsubq_s32(mid, side), shift4); + lefti = vreinterpretq_s32_u32(vshlq_u32(vaddq_u32(mid, side), shift4)); + righti = vreinterpretq_s32_u32(vshlq_u32(vsubq_u32(mid, side), shift4)); leftf = vmulq_f32(vcvtq_f32_s32(lefti), factor4); rightf = vmulq_f32(vcvtq_f32_s32(righti), factor4); @@ -10956,19 +10967,19 @@ static DRFLAC_INLINE void drflac_read_pcm_frames_f32__decode_mid_side__neon(drfl } for (i = (frameCount4 << 2); i < frameCount; ++i) { - drflac_int32 mid = pInputSamples0[i] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample; - drflac_int32 side = pInputSamples1[i] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample; + drflac_uint32 mid = pInputSamples0U32[i] << 
pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample; + drflac_uint32 side = pInputSamples1U32[i] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample; - mid = (((drflac_uint32)mid) << 1) | (side & 0x01); + mid = (mid << 1) | (side & 0x01); - pOutputSamples[i*2+0] = (float)(((mid + side) << shift) * factor); - pOutputSamples[i*2+1] = (float)(((mid - side) << shift) * factor); + pOutputSamples[i*2+0] = (drflac_int32)((mid + side) << shift) * factor; + pOutputSamples[i*2+1] = (drflac_int32)((mid - side) << shift) * factor; } } } #endif -static DRFLAC_INLINE void drflac_read_pcm_frames_f32__decode_mid_side(drflac* pFlac, drflac_uint64 frameCount, drflac_int32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, float* pOutputSamples) +static DRFLAC_INLINE void drflac_read_pcm_frames_f32__decode_mid_side(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, float* pOutputSamples) { #if defined(DRFLAC_SUPPORT_SSE2) if (drflac__gIsSSE2Supported && pFlac->bitsPerSample <= 24) { @@ -10990,63 +11001,64 @@ static DRFLAC_INLINE void drflac_read_pcm_frames_f32__decode_mid_side(drflac* pF } #if 0 -static DRFLAC_INLINE void drflac_read_pcm_frames_f32__decode_independent_stereo__reference(drflac* pFlac, drflac_uint64 frameCount, drflac_int32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, float* pOutputSamples) +static DRFLAC_INLINE void drflac_read_pcm_frames_f32__decode_independent_stereo__reference(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, float* pOutputSamples) { for (drflac_uint64 i = 0; i < frameCount; ++i) { - pOutputSamples[i*2+0] = (float)((pInputSamples0[i] << (unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample)) / 2147483648.0); - 
pOutputSamples[i*2+1] = (float)((pInputSamples1[i] << (unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample)) / 2147483648.0); + pOutputSamples[i*2+0] = (float)((drflac_int32)((drflac_uint32)pInputSamples0[i] << (unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample)) / 2147483648.0); + pOutputSamples[i*2+1] = (float)((drflac_int32)((drflac_uint32)pInputSamples1[i] << (unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample)) / 2147483648.0); } } #endif -static DRFLAC_INLINE void drflac_read_pcm_frames_f32__decode_independent_stereo__scalar(drflac* pFlac, drflac_uint64 frameCount, drflac_int32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, float* pOutputSamples) +static DRFLAC_INLINE void drflac_read_pcm_frames_f32__decode_independent_stereo__scalar(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, float* pOutputSamples) { drflac_uint64 i; drflac_uint64 frameCount4 = frameCount >> 2; - + const drflac_uint32* pInputSamples0U32 = (const drflac_uint32*)pInputSamples0; + const drflac_uint32* pInputSamples1U32 = (const drflac_uint32*)pInputSamples1; + drflac_uint32 shift0 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample; + drflac_uint32 shift1 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample; float factor = 1 / 2147483648.0; - drflac_int32 shift0 = (unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample); - drflac_int32 shift1 = (unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample); - for (i = 0; i < frameCount4; ++i) { - drflac_int32 tempL0 = pInputSamples0[i*4+0] << shift0; - drflac_int32 tempL1 = pInputSamples0[i*4+1] << shift0; - drflac_int32 tempL2 = pInputSamples0[i*4+2] << shift0; - drflac_int32 tempL3 = pInputSamples0[i*4+3] << shift0; 
+ drflac_uint32 tempL0 = pInputSamples0U32[i*4+0] << shift0; + drflac_uint32 tempL1 = pInputSamples0U32[i*4+1] << shift0; + drflac_uint32 tempL2 = pInputSamples0U32[i*4+2] << shift0; + drflac_uint32 tempL3 = pInputSamples0U32[i*4+3] << shift0; - drflac_int32 tempR0 = pInputSamples1[i*4+0] << shift1; - drflac_int32 tempR1 = pInputSamples1[i*4+1] << shift1; - drflac_int32 tempR2 = pInputSamples1[i*4+2] << shift1; - drflac_int32 tempR3 = pInputSamples1[i*4+3] << shift1; + drflac_uint32 tempR0 = pInputSamples1U32[i*4+0] << shift1; + drflac_uint32 tempR1 = pInputSamples1U32[i*4+1] << shift1; + drflac_uint32 tempR2 = pInputSamples1U32[i*4+2] << shift1; + drflac_uint32 tempR3 = pInputSamples1U32[i*4+3] << shift1; - pOutputSamples[i*8+0] = (float)(tempL0 * factor); - pOutputSamples[i*8+1] = (float)(tempR0 * factor); - pOutputSamples[i*8+2] = (float)(tempL1 * factor); - pOutputSamples[i*8+3] = (float)(tempR1 * factor); - pOutputSamples[i*8+4] = (float)(tempL2 * factor); - pOutputSamples[i*8+5] = (float)(tempR2 * factor); - pOutputSamples[i*8+6] = (float)(tempL3 * factor); - pOutputSamples[i*8+7] = (float)(tempR3 * factor); + pOutputSamples[i*8+0] = (drflac_int32)tempL0 * factor; + pOutputSamples[i*8+1] = (drflac_int32)tempR0 * factor; + pOutputSamples[i*8+2] = (drflac_int32)tempL1 * factor; + pOutputSamples[i*8+3] = (drflac_int32)tempR1 * factor; + pOutputSamples[i*8+4] = (drflac_int32)tempL2 * factor; + pOutputSamples[i*8+5] = (drflac_int32)tempR2 * factor; + pOutputSamples[i*8+6] = (drflac_int32)tempL3 * factor; + pOutputSamples[i*8+7] = (drflac_int32)tempR3 * factor; } for (i = (frameCount4 << 2); i < frameCount; ++i) { - pOutputSamples[i*2+0] = (float)((pInputSamples0[i] << shift0) * factor); - pOutputSamples[i*2+1] = (float)((pInputSamples1[i] << shift1) * factor); + pOutputSamples[i*2+0] = (drflac_int32)(pInputSamples0U32[i] << shift0) * factor; + pOutputSamples[i*2+1] = (drflac_int32)(pInputSamples1U32[i] << shift1) * factor; } } #if defined(DRFLAC_SUPPORT_SSE2) 
-static DRFLAC_INLINE void drflac_read_pcm_frames_f32__decode_independent_stereo__sse2(drflac* pFlac, drflac_uint64 frameCount, drflac_int32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, float* pOutputSamples) +static DRFLAC_INLINE void drflac_read_pcm_frames_f32__decode_independent_stereo__sse2(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, float* pOutputSamples) { drflac_uint64 i; drflac_uint64 frameCount4 = frameCount >> 2; + const drflac_uint32* pInputSamples0U32 = (const drflac_uint32*)pInputSamples0; + const drflac_uint32* pInputSamples1U32 = (const drflac_uint32*)pInputSamples1; + drflac_uint32 shift0 = (unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample) - 8; + drflac_uint32 shift1 = (unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample) - 8; float factor = 1.0f / 8388608.0f; - __m128 factor128 = _mm_set1_ps(1.0f / 8388608.0f); - - drflac_int32 shift0 = (unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample) - 8; - drflac_int32 shift1 = (unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample) - 8; + __m128 factor128 = _mm_set1_ps(factor); for (i = 0; i < frameCount4; ++i) { __m128i lefti; @@ -11065,26 +11077,26 @@ static DRFLAC_INLINE void drflac_read_pcm_frames_f32__decode_independent_stereo_ } for (i = (frameCount4 << 2); i < frameCount; ++i) { - pOutputSamples[i*2+0] = (float)((pInputSamples0[i] << shift0) * factor); - pOutputSamples[i*2+1] = (float)((pInputSamples1[i] << shift1) * factor); + pOutputSamples[i*2+0] = (drflac_int32)(pInputSamples0U32[i] << shift0) * factor; + pOutputSamples[i*2+1] = (drflac_int32)(pInputSamples1U32[i] << shift1) * factor; } } #endif #if defined(DRFLAC_SUPPORT_NEON) -static DRFLAC_INLINE void drflac_read_pcm_frames_f32__decode_independent_stereo__neon(drflac* pFlac, 
drflac_uint64 frameCount, drflac_int32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, float* pOutputSamples) +static DRFLAC_INLINE void drflac_read_pcm_frames_f32__decode_independent_stereo__neon(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, float* pOutputSamples) { drflac_uint64 i; drflac_uint64 frameCount4 = frameCount >> 2; + const drflac_uint32* pInputSamples0U32 = (const drflac_uint32*)pInputSamples0; + const drflac_uint32* pInputSamples1U32 = (const drflac_uint32*)pInputSamples1; + drflac_uint32 shift0 = (unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample) - 8; + drflac_uint32 shift1 = (unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample) - 8; float factor = 1.0f / 8388608.0f; float32x4_t factor4 = vdupq_n_f32(factor); - - drflac_int32 shift0 = (unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample) - 8; - drflac_int32 shift1 = (unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample) - 8; - - int32x4_t shift0_4 = vdupq_n_s32(shift0); - int32x4_t shift1_4 = vdupq_n_s32(shift1); + int32x4_t shift0_4 = vdupq_n_s32(shift0); + int32x4_t shift1_4 = vdupq_n_s32(shift1); for (i = 0; i < frameCount4; ++i) { int32x4_t lefti; @@ -11092,8 +11104,8 @@ static DRFLAC_INLINE void drflac_read_pcm_frames_f32__decode_independent_stereo_ float32x4_t leftf; float32x4_t rightf; - lefti = vshlq_s32(vld1q_s32(pInputSamples0 + i*4), shift0_4); - righti = vshlq_s32(vld1q_s32(pInputSamples1 + i*4), shift1_4); + lefti = vreinterpretq_s32_u32(vshlq_u32(vld1q_u32(pInputSamples0U32 + i*4), shift0_4)); + righti = vreinterpretq_s32_u32(vshlq_u32(vld1q_u32(pInputSamples1U32 + i*4), shift1_4)); leftf = vmulq_f32(vcvtq_f32_s32(lefti), factor4); rightf = vmulq_f32(vcvtq_f32_s32(righti), factor4); @@ -11102,13 +11114,13 @@ static DRFLAC_INLINE void 
drflac_read_pcm_frames_f32__decode_independent_stereo_ } for (i = (frameCount4 << 2); i < frameCount; ++i) { - pOutputSamples[i*2+0] = (float)((pInputSamples0[i] << shift0) * factor); - pOutputSamples[i*2+1] = (float)((pInputSamples1[i] << shift1) * factor); + pOutputSamples[i*2+0] = (drflac_int32)(pInputSamples0U32[i] << shift0) * factor; + pOutputSamples[i*2+1] = (drflac_int32)(pInputSamples1U32[i] << shift1) * factor; } } #endif -static DRFLAC_INLINE void drflac_read_pcm_frames_f32__decode_independent_stereo(drflac* pFlac, drflac_uint64 frameCount, drflac_int32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, float* pOutputSamples) +static DRFLAC_INLINE void drflac_read_pcm_frames_f32__decode_independent_stereo(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, float* pOutputSamples) { #if defined(DRFLAC_SUPPORT_SSE2) if (drflac__gIsSSE2Supported && pFlac->bitsPerSample <= 24) { @@ -11132,7 +11144,7 @@ static DRFLAC_INLINE void drflac_read_pcm_frames_f32__decode_independent_stereo( DRFLAC_API drflac_uint64 drflac_read_pcm_frames_f32(drflac* pFlac, drflac_uint64 framesToRead, float* pBufferOut) { drflac_uint64 framesRead; - drflac_int32 unusedBitsPerSample; + drflac_uint32 unusedBitsPerSample; if (pFlac == NULL || framesToRead == 0) { return 0; @@ -11142,6 +11154,7 @@ DRFLAC_API drflac_uint64 drflac_read_pcm_frames_f32(drflac* pFlac, drflac_uint64 return drflac__seek_forward_by_pcm_frames(pFlac, framesToRead); } + DRFLAC_ASSERT(pFlac->bitsPerSample <= 32); unusedBitsPerSample = 32 - pFlac->bitsPerSample; framesRead = 0; @@ -11692,6 +11705,13 @@ DRFLAC_API drflac_bool32 drflac_next_cuesheet_track(drflac_cuesheet_track_iterat /* REVISION HISTORY ================ +v0.12.11 - 2020-04-19 + - Fix some pedantic warnings. + - Fix some undefined behaviour warnings. 
+ +v0.12.10 - 2020-04-10 + - Fix some bugs when trying to seek with an invalid seek table. + v0.12.9 - 2020-04-05 - Fix warnings.