[xcode] Support for ALAC encoded RSP/DAAP streaming

2023-12-05 21:47:44 +01:00 · 2023-12-05 21:47:44 +01:00 · 9749ded184
parent 85e9b06bca
commit 9749ded184
6 changed files with 153 additions and 68 deletions
--- a/owntone.conf.in
+++ b/owntone.conf.in
@ -187,19 +187,23 @@ library {
 	# Should we import the content of iTunes smart playlists?
 #	itunes_smartpl = false

-	# Decoding options for DAAP and RSP clients
+	# Transcoding options for DAAP and RSP clients
 	# Since iTunes has native support for mpeg, mp4a, mp4v, alac and wav,
-	# such files will be sent as they are. Any other formats will be decoded
-	# to raw wav. If OwnTone detects a non-iTunes DAAP client, it is
-	# assumed to only support mpeg and wav, other formats will be decoded.
-	# Here you can change when to decode. Note that these settings only
-	# affect serving media to DAAP and RSP clients, they have no effect on
+	# such files will be sent as they are. Any other formats will be
+	# transcoded. Some other clients, including Roku/RSP, announce what
+	# formats they support, and the server will transcode to one of those if
+	# necessary. Clients that don't announce supported formats are assumed
+	# to support mpeg (mp3), wav and alac.
+	# Here you can change when and how to transcode. These settings *only*
+	# affect serving audio to DAAP and RSP clients; they have no effect on
 	# direct AirPlay, Chromecast and local audio playback.
 	# Formats: mp4a, mp4v, mpeg, alac, flac, mpc, ogg, wma, wmal, wmav, aif, wav
-	# Formats that should never be decoded
+	# Formats that should never be transcoded
 #	no_decode = { "format", "format" }
-	# Formats that should always be decoded
+	# Formats that should always be transcoded
 #	force_decode = { "format", "format" }
+	# Preferred format to transcode to: alac (default), wav or mpeg (mp3 at 320 kbps)
+#	prefer_format = "format"

 	# Set ffmpeg filters (similar to 'ffmpeg -af xxx') that you want the
 	# server to use when decoding files from your library. Examples:
--- a/src/conffile.c
+++ b/src/conffile.c
@ -111,6 +111,7 @@ static cfg_opt_t sec_library[] =
    CFG_BOOL("itunes_smartpl", cfg_false, CFGF_NONE),
    CFG_STR_LIST("no_decode", NULL, CFGF_NONE),
    CFG_STR_LIST("force_decode", NULL, CFGF_NONE),
+    CFG_STR("prefer_format", NULL, CFGF_NONE),
    CFG_BOOL("pipe_autostart", cfg_true, CFGF_NONE),
    CFG_INT("pipe_sample_rate", 44100, CFGF_NONE),
    CFG_INT("pipe_bits_per_sample", 16, CFGF_NONE),
--- a/src/httpd.c
+++ b/src/httpd.c
@ -119,6 +119,7 @@ static const struct content_type_map ext2ctype[] =
    { ".png",  XCODE_PNG,  "image/png" },
    { ".jpg",  XCODE_JPEG, "image/jpeg" },
    { ".mp3",  XCODE_MP3,  "audio/mpeg" },
+    { ".m4a",  XCODE_MP4,  "audio/mp4" },
    { ".wav",  XCODE_WAV,  "audio/wav" },
    { NULL,    XCODE_NONE, NULL }
  };
@ -678,7 +679,7 @@ stream_new_transcode(struct media_file_info *mfi, enum transcode_profile profile
                     int64_t offset, int64_t end_offset, event_callback_fn stream_cb)
 {
  struct stream_ctx *st;
-  struct media_quality quality = { HTTPD_STREAM_SAMPLE_RATE, HTTPD_STREAM_BPS, HTTPD_STREAM_CHANNELS, HTTPD_STREAM_BIT_RATE };
+//  struct media_quality quality = { HTTPD_STREAM_SAMPLE_RATE, HTTPD_STREAM_BPS, HTTPD_STREAM_CHANNELS, HTTPD_STREAM_BIT_RATE };

  st = stream_new(mfi, hreq, stream_cb);
  if (!st)
@ -686,7 +687,7 @@ stream_new_transcode(struct media_file_info *mfi, enum transcode_profile profile
      goto error;
    }

-  st->xcode = transcode_setup(profile, &quality, mfi->data_kind, mfi->path, mfi->song_length);
+  st->xcode = transcode_setup(profile, NULL, mfi->data_kind, mfi->path, mfi->song_length);
  if (!st->xcode)
    {
      DPRINTF(E_WARN, L_HTTPD, "Transcoding setup failed, aborting streaming\n");
--- a/src/httpd_rsp.c
+++ b/src/httpd_rsp.c
@ -710,6 +710,16 @@ rsp_stream(struct httpd_request *hreq)
 //  /rsp/stream/36364
 //  /rsp/db/0?query=id%3D36365&type=full
 //  /rsp/stream/36365
+//
+// Headers sent by Roku M2000 and M1001 in stream requests (possibly by other models too):
+//
+// 'User-Agent': 'Roku SoundBridge/3.0'
+// 'Host': '192.168.1.119:3689'
+// 'Accept': '*/*'
+// 'Pragma': 'no-cache'
+// 'accept-codecs': 'wma,mpeg,wav,mp4a,alac'
+// 'rsp-version': '0.1'
+// 'transcode-codecs': 'wav,mp3'
 static struct httpd_uri_map rsp_handlers[] =
  {
    {
--- a/src/transcode.c
+++ b/src/transcode.c
@ -63,8 +63,10 @@
 #define WAV_HEADER_LEN 44
 // Max filters in a filtergraph
 #define MAX_FILTERS 9
+// Set to same size as in httpd.c (but can be set to something else)
+#define STREAM_CHUNK_SIZE (64 * 1024)

-static const char *default_codecs = "mpeg,wav";
+static const char *default_codecs = "mpeg,alac,wav";
 static const char *roku_codecs = "mpeg,mp4a,wma,alac,wav";
 static const char *itunes_codecs = "mpeg,mp4a,mp4v,alac,wav";

@ -93,8 +95,8 @@ struct settings_ctx
  AVChannelLayout channel_layout;
 #else
  uint64_t channel_layout;
-  int channels;
 #endif
+  int nb_channels;
  int bit_rate;
  int frame_size;
  enum AVSampleFormat sample_format;
@ -289,6 +291,12 @@ init_settings(struct settings_ctx *settings, enum transcode_profile profile, str
 	settings->frame_size = 352;
 	break;

+      case XCODE_MP4:
+	settings->encode_audio = true;
+	settings->format = "mp4";
+	settings->audio_codec = AV_CODEC_ID_ALAC;
+	break;
+
      case XCODE_OGG:
 	settings->encode_audio = true;
 	settings->in_format = "ogg";
@ -371,6 +379,65 @@ init_settings(struct settings_ctx *settings, enum transcode_profile profile, str
  return 0;
 }

+static int
+init_settings_from_video(struct settings_ctx *settings, enum transcode_profile profile, struct decode_ctx *src_ctx, int width, int height)
+{
+  settings->width = width;
+  settings->height = height;
+
+  return 0;
+}
+
+static int
+init_settings_from_audio(struct settings_ctx *settings, enum transcode_profile profile, struct decode_ctx *src_ctx, struct media_quality *quality)
+{
+  int src_bytes_per_sample = av_get_bytes_per_sample(src_ctx->audio_stream.codec->sample_fmt);
+
+  // Initialize unset settings that are source-dependent, not profile-dependent
+  if (!settings->sample_rate)
+    settings->sample_rate = src_ctx->audio_stream.codec->sample_rate;
+
+#if USE_CH_LAYOUT
+  if (!av_channel_layout_check(&settings->channel_layout))
+    av_channel_layout_copy(&settings->channel_layout, &src_ctx->audio_stream.codec->ch_layout);
+
+  settings->nb_channels = settings->channel_layout.nb_channels;
+#else
+  if (settings->nb_channels == 0)
+    {
+      settings->nb_channels = src_ctx->audio_stream.codec->channels;
+      settings->channel_layout = src_ctx->audio_stream.codec->channel_layout;
+    }
+#endif
+
+  // Initialize settings that are both source-dependent and profile-dependent
+  switch (profile)
+    {
+      case XCODE_MP4:
+	if (!settings->sample_format)
+	  settings->sample_format = (src_bytes_per_sample == 4) ? AV_SAMPLE_FMT_S32P : AV_SAMPLE_FMT_S16P;
+	break;
+
+      case XCODE_PCM_NATIVE:
+	if (!settings->sample_format)
+	  settings->sample_format = (src_bytes_per_sample == 4) ? AV_SAMPLE_FMT_S32 : AV_SAMPLE_FMT_S16;
+	if (!settings->audio_codec)
+	  settings->audio_codec = (src_bytes_per_sample == 4) ? AV_CODEC_ID_PCM_S32LE : AV_CODEC_ID_PCM_S16LE;
+	if (!settings->format)
+	  settings->format = (src_bytes_per_sample == 4) ? "s32le" : "s16le";
+	break;
+
+      default:
+	if (settings->sample_format && settings->audio_codec && settings->format)
+	  return 0;
+
+	DPRINTF(E_LOG, L_XCODE, "Bug! Profile %d has unset encoding parameters\n", profile);
+	return -1;
+    }
+
+  return 0;
+}
+
 static void
 stream_settings_set(struct stream_ctx *s, struct settings_ctx *settings, enum AVMediaType type)
 {
@ -474,6 +541,8 @@ size_estimate(enum transcode_profile profile, int bit_rate, int sample_rate, int
    bytes = (int64_t)len_ms * channels * bytes_per_sample * sample_rate / 1000 + WAV_HEADER_LEN;
  else if (profile == XCODE_MP3)
    bytes = (int64_t)len_ms * bit_rate / 8000;
+  else if (profile == XCODE_MP4)
+    bytes = (int64_t)len_ms * channels * bytes_per_sample * sample_rate / 1000 / 2; // FIXME
  else
    bytes = -1;

@ -1187,6 +1256,7 @@ open_output(struct encode_ctx *ctx, struct decode_ctx *src_ctx)
  // Not const before ffmpeg 5.0
  AVOutputFormat *oformat;
 #endif
+  AVDictionary *options = NULL;
  int ret;

  oformat = av_guess_format(ctx->settings.format, NULL, NULL);
@ -1236,14 +1306,30 @@ open_output(struct encode_ctx *ctx, struct decode_ctx *src_ctx)
 	goto out_free_streams;
    }

+  // By default ffmpeg can't mux mp4 to a stream, since it is non-seekable, and
+  // normally the muxing involves writing some header bytes when the encoding is
+  // completed. This is a solution for that, found on Stack Overflow. Setting
+  // "movflags" to "empty_moov" was also suggested, but doesn't seem required.
+  if (strcmp("mp4", oformat->name) == 0)
+    {
+      av_dict_set_int(&options, "frag_size", STREAM_CHUNK_SIZE, 0);
+//      av_dict_set(&options, "movflags", "empty_moov", 0);
+    }
+
  // Notice, this will not write WAV header (so we do that manually)
-  ret = avformat_write_header(ctx->ofmt_ctx, NULL);
+  ret = avformat_write_header(ctx->ofmt_ctx, &options);
  if (ret < 0)
    {
      DPRINTF(E_LOG, L_XCODE, "Error writing header to output buffer: %s\n", err2str(ret));
      goto out_free_streams;
    }

+  if (options)
+    {
+      DPRINTF(E_WARN, L_XCODE, "Didn't recognize all options given to avformat_write_header\n");
+      av_dict_free(&options);
+    }
+
  if (ctx->settings.with_wav_header)
    {
      evbuffer_add(ctx->obuf, ctx->wav_header, sizeof(ctx->wav_header));
@ -1631,71 +1717,30 @@ struct encode_ctx *
 transcode_encode_setup(enum transcode_profile profile, struct media_quality *quality, struct decode_ctx *src_ctx, int width, int height)
 {
  struct encode_ctx *ctx;
-  int src_bytes_per_sample;
  int dst_bytes_per_sample;
-  int channels;

  CHECK_NULL(L_XCODE, ctx = calloc(1, sizeof(struct encode_ctx)));
  CHECK_NULL(L_XCODE, ctx->filt_frame = av_frame_alloc());
  CHECK_NULL(L_XCODE, ctx->encoded_pkt = av_packet_alloc());

+  // Initialize general settings
  if (init_settings(&ctx->settings, profile, quality) < 0)
    goto fail_free;

-  ctx->settings.width = width;
-  ctx->settings.height = height;
+  if (ctx->settings.encode_audio && init_settings_from_audio(&ctx->settings, profile, src_ctx, quality) < 0)
+    goto fail_free;

-  // Caller did not specify a sample rate -> use same as source
-  if (!ctx->settings.sample_rate && ctx->settings.encode_audio)
-    {
-      ctx->settings.sample_rate = src_ctx->audio_stream.codec->sample_rate;
-    }
-
-  // Caller did not specify a sample format -> determine from source
-  if (!ctx->settings.sample_format && ctx->settings.encode_audio)
-    {
-      src_bytes_per_sample = av_get_bytes_per_sample(src_ctx->audio_stream.codec->sample_fmt);
-      if (src_bytes_per_sample == 4)
-	{
-	  ctx->settings.sample_format = AV_SAMPLE_FMT_S32;
-	  ctx->settings.audio_codec = AV_CODEC_ID_PCM_S32LE;
-	  ctx->settings.format = "s32le";
-	}
-      else
-	{
-	  ctx->settings.sample_format = AV_SAMPLE_FMT_S16;
-	  ctx->settings.audio_codec = AV_CODEC_ID_PCM_S16LE;
-	  ctx->settings.format = "s16le";
-	}
-    }
-
-#if USE_CH_LAYOUT
-  // Caller did not specify channels -> use same as source
-  if (!av_channel_layout_check(&ctx->settings.channel_layout) && ctx->settings.encode_audio)
-    {
-      av_channel_layout_copy(&ctx->settings.channel_layout, &src_ctx->audio_stream.codec->ch_layout);
-    }
-
-  channels = ctx->settings.channel_layout.nb_channels;
-#else
-  // Caller did not specify channels -> use same as source
-  if (ctx->settings.channels == 0 && ctx->settings.encode_audio)
-    {
-      ctx->settings.channels = src_ctx->audio_stream.codec->channels;
-      ctx->settings.channel_layout = src_ctx->audio_stream.codec->channel_layout;
-    }
-
-  channels = ctx->settings.channels;
-#endif
+  if (ctx->settings.encode_video && init_settings_from_video(&ctx->settings, profile, src_ctx, width, height) < 0)
+    goto fail_free;

  dst_bytes_per_sample = av_get_bytes_per_sample(ctx->settings.sample_format);

-  ctx->bytes_total = size_estimate(profile, ctx->settings.bit_rate, ctx->settings.sample_rate, dst_bytes_per_sample, channels, src_ctx->len_ms);
+  ctx->bytes_total = size_estimate(profile, ctx->settings.bit_rate, ctx->settings.sample_rate, dst_bytes_per_sample, ctx->settings.nb_channels, src_ctx->len_ms);

  if (ctx->settings.with_wav_header)
-    make_wav_header(ctx->wav_header, ctx->settings.sample_rate, dst_bytes_per_sample, channels, ctx->bytes_total);
+    make_wav_header(ctx->wav_header, ctx->settings.sample_rate, dst_bytes_per_sample, ctx->settings.nb_channels, ctx->bytes_total);
  if (ctx->settings.with_icy && src_ctx->data_kind == DATA_KIND_HTTP)
-    ctx->icy_interval = METADATA_ICY_INTERVAL * channels * dst_bytes_per_sample * ctx->settings.sample_rate;
+    ctx->icy_interval = METADATA_ICY_INTERVAL * ctx->settings.nb_channels * dst_bytes_per_sample * ctx->settings.sample_rate;

  if (open_output(ctx, src_ctx) < 0)
    goto fail_free;
@ -1804,9 +1849,13 @@ transcode_decode_setup_raw(enum transcode_profile profile, struct media_quality
 enum transcode_profile
 transcode_needed(const char *user_agent, const char *client_codecs, char *file_codectype)
 {
-  char *codectype;
+  const char *codectype;
+  const char *prefer_format;
  cfg_t *lib;
  bool force_xcode;
+  bool supports_alac;
+  bool supports_mpeg;
+  bool supports_wav;
  int count;
  int i;

@ -1862,10 +1911,28 @@ transcode_needed(const char *user_agent, const char *client_codecs, char *file_c

  if (!force_xcode && strstr(client_codecs, file_codectype))
    return XCODE_NONE;
-  else if (strstr(client_codecs, "mpeg"))
-    return XCODE_MP3;
-  else if (strstr(client_codecs, "wav"))
+
+  supports_alac = strstr(client_codecs, "alac") || strstr(client_codecs, "mp4a");
+  supports_mpeg = strstr(client_codecs, "mpeg");
+  supports_wav = strstr(client_codecs, "wav");
+
+  prefer_format = cfg_getstr(lib, "prefer_format");
+  if (prefer_format)
+    {
+      if (strcmp(prefer_format, "alac") == 0 && supports_alac)
+	return XCODE_MP4;
+      else if (strcmp(prefer_format, "wav") == 0 && supports_wav)
+	return XCODE_WAV;
+      else if (strcmp(prefer_format, "mpeg") == 0 && supports_mpeg)
+	return XCODE_MP3;
+    }
+
+  if (supports_alac)
+    return XCODE_MP4;
+  else if (supports_wav)
    return XCODE_WAV;
+  else if (supports_mpeg)
+    return XCODE_MP3;
  else
    return XCODE_UNKNOWN;
 }
--- a/src/transcode.h
+++ b/src/transcode.h
@ -23,10 +23,12 @@ enum transcode_profile
  XCODE_PCM32,
  // Transcodes the best audio stream to MP3
  XCODE_MP3,
-  // Transcodes the best audio stream to OPUS
+  // Transcodes the best audio stream to raw OPUS (no container)
  XCODE_OPUS,
-  // Transcodes the best audio stream to ALAC
+  // Transcodes the best audio stream to raw ALAC (no container)
  XCODE_ALAC,
+  // Transcodes the best audio stream to ALAC in an MP4 container
+  XCODE_MP4,
  // Transcodes the best audio stream from OGG
  XCODE_OGG,
  // Transcodes the best video stream to JPEG/PNG/VP8