mumble-voip · Krzmbrzl · Jun 14, 2021 · May 31, 2021
diff --git a/src/mumble/Audio.h b/src/mumble/Audio.h
@@ -17,6 +17,12 @@
 
 #define SAMPLE_RATE 48000
 
+// Interaural delay (in samples) for a sound arriving directly from one side of the head.
+// According to Wikipedia, the average distance between the ears is 15.2 cm for men
+// (0.44 ms delay) and 14.4 cm for women (0.42 ms delay); we use the mean of 0.43 ms.
+// Each delay is distance / speed of sound; dividing by the sample period yields samples.
+constexpr float INTERAURAL_DELAY = 0.00043 / (1 / static_cast< float >(SAMPLE_RATE));
+
 typedef QPair< QString, QVariant > audioDevice;
 
 class LoopUser : public ClientUser {

diff --git a/src/mumble/AudioOutput.cpp b/src/mumble/AudioOutput.cpp
@@ -588,29 +588,59 @@ bool AudioOutput::mix(void *outbuff, unsigned int frameCount) {
 						aop->pfVolume[s] = -1.0;
 				}
 
+				if (!aop->piOffset) {
+					aop->piOffset = std::make_unique< unsigned int[] >(nchan);
+					for (unsigned int s = 0; s < nchan; ++s) {
+						aop->piOffset[s] = 0;
+					}
+				}
+
 				for (unsigned int s = 0; s < nchan; ++s) {
 					const float dot = bSpeakerPositional[s]
 										  ? connectionVec.x * speaker[s * 3 + 0] + connectionVec.y * speaker[s * 3 + 1]
 												+ connectionVec.z * speaker[s * 3 + 2]
 										  : 1.0f;
-					const float str   = svol[s] * calcGain(dot, len) * volumeAdjustment;
+					// Volume on the ear opposite to the sound should never reach 0 in the real world.
+					// The gain is multiplied by 19/20 and 1/20 is added. This will have the effect
+					// of bringing the lowest value up to 1/20, while keeping the highest value at 1.
+					// E.g. calcGain() = 1; 1 * 19/20 + 1/20 = 0.95 + 0.05 = 1
+					// calcGain() = 0; 0 * 19/20 + 1/20 = 0 + 0.05 = 0.05
+					const float str   = svol[s] * (1 / 20.0 + (19 / 20.0) * calcGain(dot, len)) * volumeAdjustment;
 					float *RESTRICT o = output + s;
 					const float old   = (aop->pfVolume[s] >= 0.0f) ? aop->pfVolume[s] : str;
 					const float inc   = (str - old) / static_cast< float >(frameCount);
 					aop->pfVolume[s]  = str;
+
+					// Calculate the ITD offset (in samples) for the audio data of this chunk.
+					// Interaural Time Delay (ITD) is the small difference in arrival time of a
+					// sound at your two ears; it depends on the position of the sound source on
+					// the horizontal plane and on the distance between your ears.
+					//
+					// Offset for ITD is not applied directly, but rather the offset is interpolated
+					// linearly across the entire chunk, between the offset of the last chunk and the
+					// newly calculated offset for this chunk. This prevents clicking / buzzing when the
+					// audio source or camera is moving, because abruptly changing offsets (and thus
+					// abruptly changing the playback position) will create a clicking noise.
+					const int offset =
+						INTERAURAL_DELAY * (1.0 + dot) / 2.0; // Normalize dot to range [0,1] instead of [-1,1]
+					const int oldOffset   = aop->piOffset[s];
+					const float incOffset = (offset - oldOffset) / static_cast< float >(frameCount);
+					aop->piOffset[s]      = offset;
 					/*
 										qWarning("%d: Pos %f %f %f : Dot %f Len %f Str %f", s, speaker[s*3+0],
 					   speaker[s*3+1], speaker[s*3+2], dot, len, str);
 					*/
 					if ((old >= 0.00000001f) || (str >= 0.00000001f)) {
 						for (unsigned int i = 0; i < frameCount; ++i) {
+							unsigned int currentOffset = oldOffset + incOffset * i;
 							if (speech && speech->bStereo) {
 								// Mix stereo user's stream into mono
 								// frame: for a stereo stream, the [LR] pair inside ...[LR]LRLRLR.... is a frame
-								o[i * nchan] += (pfBuffer[2 * i] / 2.0 + pfBuffer[2 * i + 1] / 2.0)
-												* (old + inc * static_cast< float >(i));
+								o[i * nchan] +=
+									(pfBuffer[2 * i + currentOffset] / 2.0 + pfBuffer[2 * i + currentOffset + 1] / 2.0)
+									* (old + inc * static_cast< float >(i));
 							} else {
-								o[i * nchan] += pfBuffer[i] * (old + inc * static_cast< float >(i));
+								o[i * nchan] += pfBuffer[i + currentOffset] * (old + inc * static_cast< float >(i));
 							}
 						}
 					}

diff --git a/src/mumble/AudioOutputSample.cpp b/src/mumble/AudioOutputSample.cpp
@@ -228,7 +228,8 @@ bool AudioOutputSample::prepareSampleBuffer(unsigned int frameCount) {
 	iLastConsume = sampleCount;
 
 	// Check if we can satisfy request with current buffer
-	if (iBufferFilled >= sampleCount)
+	// Maximum interaural delay is accounted for to prevent audio glitches
+	if (iBufferFilled >= sampleCount + INTERAURAL_DELAY)
 		return true;
 
 	// Calculate the required buffersize to hold the results
@@ -241,7 +242,7 @@ bool AudioOutputSample::prepareSampleBuffer(unsigned int frameCount) {
 	bool eof = false;
 	sf_count_t read;
 	do {
-		resizeBuffer(iBufferFilled + sampleCount);
+		resizeBuffer(iBufferFilled + sampleCount + INTERAURAL_DELAY);
 
 		// If we need to resample, write to the buffer on stack
 		float *pOut = (srs) ? fOut : pfBuffer + iBufferFilled;
@@ -270,7 +271,7 @@ bool AudioOutputSample::prepareSampleBuffer(unsigned int frameCount) {
 		}
 
 		iBufferFilled += outlen * channels;
-	} while (iBufferFilled < sampleCount);
+	} while (iBufferFilled < sampleCount + INTERAURAL_DELAY);
 
 	if (eof && !bEof) {
 		emit playbackFinished();

diff --git a/src/mumble/AudioOutputSpeech.cpp b/src/mumble/AudioOutputSpeech.cpp
@@ -231,15 +231,16 @@ bool AudioOutputSpeech::prepareSampleBuffer(unsigned int frameCount) {
 
 	iLastConsume = sampleCount;
 
-	if (iBufferFilled >= sampleCount)
+	// Maximum interaural delay is accounted for to prevent audio glitches
+	if (iBufferFilled >= sampleCount + INTERAURAL_DELAY)
 		return bLastAlive;
 
 	float *pOut;
 	bool nextalive = bLastAlive;
 
-	while (iBufferFilled < sampleCount) {
+	while (iBufferFilled < sampleCount + INTERAURAL_DELAY) {
 		int decodedSamples = iFrameSize;
-		resizeBuffer(iBufferFilled + iOutputSize);
+		resizeBuffer(iBufferFilled + iOutputSize + INTERAURAL_DELAY);
 		// TODO: allocating memory in the audio callback will crash mumble in some cases.
 		//       we need to initialize the buffer with an appropriate size when initializing
 		//       this class. See #4250.

diff --git a/src/mumble/AudioOutputUser.h b/src/mumble/AudioOutputUser.h
@@ -7,6 +7,7 @@
 #define MUMBLE_MUMBLE_AUDIOOUTPUTUSER_H_
 
 #include <QtCore/QObject>
+#include <memory>
 
 class AudioOutputUser : public QObject {
 private:
@@ -28,7 +29,8 @@ class AudioOutputUser : public QObject {
 	const QString qsName;
 	float *pfBuffer = nullptr;
 	float *pfVolume = nullptr;
-	float fPos[3]   = { 0.0, 0.0, 0.0 };
+	std::unique_ptr< unsigned int[] > piOffset;
+	float fPos[3] = { 0.0, 0.0, 0.0 };
 	bool bStereo;
 	virtual bool prepareSampleBuffer(unsigned int snum) = 0;
 };