Split up audiowmark.cc into separate files.

- wmadd.cc - code to generate the watermark - wmget.cc - code to detect the watermark - wmcommon.cc/hh - code shared by add/get - audiowmark.cc - remaining code Signed-off-by: Stefan Westerfeld <stefan@space.twc.de>

Split up audiowmark.cc into separate files.
- wmadd.cc - code to generate the watermark - wmget.cc - code to detect the watermark - wmcommon.cc/hh - code shared by add/get - audiowmark.cc - remaining code Signed-off-by: Stefan Westerfeld <stefan@space.twc.de>
f1c24d9c · Stefan Westerfeld · fb503349 · f1c24d9c · f1c24d9c · f1c24d9c
Commit f1c24d9c authored Dec 09, 2019 by Stefan Westerfeld
7 changed files
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -6,7 +6,7 @@ COMMON_SRC = utils.hh utils.cc convcode.hh convcode.cc random.hh random.cc mp3.c
 	     rawconverter.cc rawconverter.hh mp3inputstream.cc mp3inputstream.hh
 COMMON_LIBS = $(SNDFILE_LIBS) $(FFTW_LIBS) $(LIBGCRYPT_LIBS) $(LIBMPG123_LIBS)

-audiowmark_SOURCES = audiowmark.cc fft.cc fft.hh $(COMMON_SRC)
+audiowmark_SOURCES = audiowmark.cc fft.cc fft.hh wmget.cc wmadd.cc wmcommon.hh wmcommon.cc $(COMMON_SRC)
 audiowmark_LDFLAGS = $(COMMON_LIBS)

 noinst_PROGRAMS = testconvcode testrandom testmp3 teststream

--- a/src/TODO
+++ b/src/TODO
@@ -5,4 +5,4 @@ streaming:
 - need explicit Error objects
 - logging
 - raw streams
- mp3 indexing
+- mp3 indexing / detect (resync-limit)
--- a/src/audiowmark.cc
+++ b/src/audiowmark.cc
@@ -2,25 +2,13 @@
 #include <math.h>
 #include <string>
 #include <random>
-#include <complex>
 #include <algorithm>
 #include <memory>

-#include "fft.hh"
 #include "wavdata.hh"
 #include "utils.hh"
-#include "convcode.hh"
 #include "random.hh"
-#include "sfinputstream.hh"
-#include "sfoutputstream.hh"
-#include "stdoutwavoutputstream.hh"
-#include "rawinputstream.hh"
-#include "rawoutputstream.hh"
-#include "mp3.hh"
-#include "mp3inputstream.hh"
-
-#include <zita-resampler/resampler.h>
-#include <zita-resampler/vresampler.h>
+#include "wmcommon.hh"

 #include <assert.h>

@@ -28,46 +16,9 @@

 using std::string;
 using std::vector;
-using std::complex;
 using std::min;
 using std::max;

-enum class Format { AUTO = 1, RAW = 2 };
-
-namespace Params
-{
-  static size_t frame_size      = 1024;
-  static int    frames_per_bit  = 2;
-  static size_t bands_per_frame = 30;
-  static int max_band          = 100;
-  static int min_band          = 20;
-  static double water_delta    = 0.01;  // strength of the watermark
-  static bool mix              = true;
-  static bool hard             = false; // hard decode bits? (soft decoding is better)
-  static bool snr              = false; // compute/show snr while adding watermark
-  static int have_key          = 0;
-  static size_t payload_size   = 128;  // number of payload bits for the watermark
-
-  static int sync_bits           = 6;
-  static int sync_frames_per_bit = 85;
-  static int sync_search_step    = 256;
-  static int sync_search_fine    = 8;
-  static double sync_threshold2  = 0.7; // minimum refined quality
-
-  static size_t frames_pad_start = 250; // padding at start, in case track starts with silence
-  static int    mark_sample_rate = 44100; // watermark generation and detection sample rate
-
-  static int test_cut            = 0; // for sync test
-  static bool test_no_sync       = false; // disable sync
-  static int test_truncate       = 0;
-
-  static Format input_format     = Format::AUTO;
-  static Format output_format    = Format::AUTO;
-
-  static RawFormat raw_input_format;
-  static RawFormat raw_output_format;
-}
-
 void
 print_usage()
 {
@@ -324,1620 +275,6 @@ parse_options (int   *argc_p,
 }


-inline double
-window_cos (double x) /* von Hann window */
-{
-  if (fabs (x) > 1)
-    return 0;
-  return 0.5 * cos (x * M_PI) + 0.5;
-}
-
-inline double
-window_hamming (double x) /* sharp (rectangle) cutoffs at boundaries */
-{
-  if (fabs (x) > 1)
-    return 0;
-
-  return 0.54 + 0.46 * cos (M_PI * x);
-}
-
-/*
- * glibc log2f is a lot faster than glibc log10
- */
-inline double
-fast_log10 (double l)
-{
-  constexpr double log2_log10_factor = 0.3010299956639811952; // 1 / log2 (10)
-
-  return log2f (l) * log2_log10_factor;
-}
-
-double
-db_from_factor (double factor, double min_dB)
-{
-  if (factor > 0)
-    {
-      double dB = fast_log10 (factor); /* Bell */
-      dB *= 20;
-      return dB;
-    }
-  else
-    return min_dB;
-}
-
-int
-frame_count (const WavData& wav_data)
-{
-  return wav_data.n_values() / wav_data.n_channels() / Params::frame_size;
-}
-
-typedef std::array<int, 30> UpDownArray;
-class UpDownGen
-{
-  Random::Stream random_stream;
-  Random         random;
-  vector<int>    bands_reorder;
-
-public:
-  UpDownGen (Random::Stream random_stream) :
-    random_stream (random_stream),
-    random (0, random_stream),
-    bands_reorder (Params::max_band - Params::min_band + 1)
-  {
-    UpDownArray x;
-    assert (x.size() == Params::bands_per_frame);
-  }
-  void
-  get (int f, UpDownArray& up, UpDownArray& down)
-  {
-    for (size_t i = 0; i < bands_reorder.size(); i++)
-      bands_reorder[i] = Params::min_band + i;
-
-    random.seed (f, random_stream); // use per frame random seed
-    random.shuffle (bands_reorder);
-
-    assert (2 * Params::bands_per_frame < bands_reorder.size());
-    for (size_t i = 0; i < Params::bands_per_frame; i++)
-      {
-        up[i]   = bands_reorder[i];
-        down[i] = bands_reorder[Params::bands_per_frame + i];
-      }
-  }
-};
-
-template<class T> vector<T>
-randomize_bit_order (const vector<T>& bit_vec, bool encode)
-{
-  vector<unsigned int> order;
-
-  for (size_t i = 0; i < bit_vec.size(); i++)
-    order.push_back (i);
-
-  Random random (/* seed */ 0, Random::Stream::bit_order);
-  random.shuffle (order);
-
-  vector<T> out_bits (bit_vec.size());
-  for (size_t i = 0; i < bit_vec.size(); i++)
-    {
-      if (encode)
-        out_bits[i] = bit_vec[order[i]];
-      else
-        out_bits[order[i]] = bit_vec[i];
-    }
-  return out_bits;
-}
-
-class FFTAnalyzer
-{
-  int           m_n_channels = 0;
-  vector<float> m_window;
-  float        *m_frame = nullptr;
-  float        *m_frame_fft = nullptr;
-public:
-  FFTAnalyzer (int n_channels) :
-    m_n_channels (n_channels)
-  {
-    /* generate analysis window */
-    m_window.resize (Params::frame_size);
-
-    double window_weight = 0;
-    for (size_t i = 0; i < Params::frame_size; i++)
-      {
-        const double fsize_2 = Params::frame_size / 2.0;
-        // const double win =  window_cos ((i - fsize_2) / fsize_2);
-        const double win = window_hamming ((i - fsize_2) / fsize_2);
-        //const double win = 1;
-        m_window[i] = win;
-        window_weight += win;
-      }
-
-    /* normalize window using window weight */
-    for (size_t i = 0; i < Params::frame_size; i++)
-      {
-        m_window[i] *= 2.0 / window_weight;
-      }
-
-    /* allocate properly aligned buffers for SIMD */
-    m_frame  = new_array_float (Params::frame_size);
-    m_frame_fft = new_array_float (Params::frame_size);
-  }
-  ~FFTAnalyzer()
-  {
-    free_array_float (m_frame);
-    free_array_float (m_frame_fft);
-  }
-  vector<vector<complex<float>>>
-  run_fft (const vector<float>& samples, size_t start_index)
-  {
-    assert (samples.size() >= (Params::frame_size + start_index) * m_n_channels);
-
-    vector<vector<complex<float>>> fft_out;
-    for (int ch = 0; ch < m_n_channels; ch++)
-      {
-        size_t pos = start_index * m_n_channels + ch;
-        assert (pos + (Params::frame_size - 1) * m_n_channels < samples.size());
-
-        /* deinterleave frame data and apply window */
-        for (size_t x = 0; x < Params::frame_size; x++)
-          {
-            m_frame[x] = samples[pos] * m_window[x];
-            pos += m_n_channels;
-          }
-        /* FFT transform */
-        fftar_float (Params::frame_size, m_frame, m_frame_fft);
-
-        /* complex<float> and frame_fft have the same layout in memory */
-        const complex<float> *first = (complex<float> *) m_frame_fft;
-        const complex<float> *last  = first + Params::frame_size / 2 + 1;
-        fft_out.emplace_back (first, last);
-      }
-
-    return fft_out;
-  }
-  vector<vector<complex<float>>>
-  fft_range (const vector<float>& samples, size_t start_index, size_t frame_count)
-  {
-    vector<vector<complex<float>>> fft_out;
-
-    /* if there is not enough space for frame_count values, return an error (empty vector) */
-    if (samples.size() < (start_index + frame_count * Params::frame_size) * m_n_channels)
-      return fft_out;
-
-    for (size_t f = 0; f < frame_count; f++)
-      {
-        const size_t frame_start = (f * Params::frame_size) + start_index;
-
-        vector<vector<complex<float>>> frame_result = run_fft (samples, frame_start);
-        for (auto& fr : frame_result)
-          fft_out.emplace_back (std::move (fr));
-      }
-    return fft_out;
-  }
-};
-
-size_t mark_data_frame_count();
-size_t mark_sync_frame_count();
-
-int
-frame_pos (int f, bool sync)
-{
-  static vector<int> pos_vec;
-
-  if (pos_vec.empty())
-    {
-      int frame_count = mark_data_frame_count() + mark_sync_frame_count();
-      for (int i = 0; i < frame_count; i++)
-        pos_vec.push_back (i);
-
-      Random random (0, Random::Stream::frame_position);
-      random.shuffle (pos_vec);
-    }
-  if (sync)
-    {
-      assert (f >= 0 && size_t (f) < mark_sync_frame_count());
-
-      return pos_vec[f];
-    }
-  else
-    {
-      assert (f >= 0 && size_t (f) < mark_data_frame_count());
-
-      return pos_vec[f + mark_sync_frame_count()];
-    }
-}
-
-int
-sync_frame_pos (int f)
-{
-  return frame_pos (f, true);
-}
-
-int
-data_frame_pos (int f)
-{
-  return frame_pos (f, false);
-}
-
-size_t
-mark_data_frame_count()
-{
-  return conv_code_size (ConvBlockType::a, Params::payload_size) * Params::frames_per_bit;
-}
-
-struct MixEntry
-{
-  int  frame;
-  int  up;
-  int  down;
-};
-
-vector<MixEntry>
-gen_mix_entries()
-{
-  const int frame_count = mark_data_frame_count();
-  vector<MixEntry> mix_entries (frame_count * Params::bands_per_frame);
-
-  UpDownGen up_down_gen (Random::Stream::data_up_down);
-  int entry = 0;
-  for (int f = 0; f < frame_count; f++)
-    {
-      const int index = data_frame_pos (f);
-      UpDownArray up, down;
-      up_down_gen.get (f, up, down);
-
-      assert (up.size() == down.size());
-      for (size_t i = 0; i < up.size(); i++)
-        mix_entries[entry++] = { index, up[i], down[i] };
-    }
-  Random random (/* seed */ 0, Random::Stream::mix);
-  random.shuffle (mix_entries);
-
-  return mix_entries;
-}
-
-enum class FrameMod : uint8_t {
-  KEEP = 0,
-  UP,
-  DOWN
-};
-
-void
-prepare_frame_mod (UpDownGen& up_down_gen, int f, vector<FrameMod>& frame_mod, int data_bit)
-{
-  UpDownArray up, down;
-  up_down_gen.get (f, up, down);
-  for (auto u : up)
-    frame_mod[u] = data_bit ? FrameMod::UP : FrameMod::DOWN;
-
-  for (auto d : down)
-    frame_mod[d] = data_bit ? FrameMod::DOWN : FrameMod::UP;
-}
-
-void
-apply_frame_mod (const vector<FrameMod>& frame_mod, const vector<complex<float>>& fft_out, vector<complex<float>>& fft_delta_spect)
-{
-  const float   min_mag = 1e-7;   // avoid computing pow (0.0, -water_delta) which would be inf
-  for (size_t i = 0; i < frame_mod.size(); i++)
-    {
-      if (frame_mod[i] == FrameMod::KEEP)
-        continue;
-
-      int data_bit_sign = (frame_mod[i] == FrameMod::UP) ? 1 : -1;
-      /*
-       * for up bands, we want do use [for a 1 bit]  (pow (mag, 1 - water_delta))
-       *
-       * this actually increases the amount of energy because mag is less than 1.0
-       */
-      const float mag = abs (fft_out[i]);
-      if (mag > min_mag)
-        {
-          const float mag_factor = powf (mag, -Params::water_delta * data_bit_sign);
-
-          fft_delta_spect[i] = fft_out[i] * (mag_factor - 1);
-        }
-    }
-}
-
-void
-mark_data (vector<vector<FrameMod>>& frame_mod, const vector<int>& bitvec)
-{
-  assert (bitvec.size() == mark_data_frame_count() / Params::frames_per_bit);
-  assert (frame_mod.size() >= mark_data_frame_count());
-
-  const int frame_count = mark_data_frame_count();
-
-  if (Params::mix)
-    {
-      vector<MixEntry> mix_entries = gen_mix_entries();
-
-      for (int f = 0; f < frame_count; f++)
-        {
-          for (size_t frame_b = 0; frame_b < Params::bands_per_frame; frame_b++)
-            {
-              int b = f * Params::bands_per_frame + frame_b;
-
-              const int data_bit = bitvec[f / Params::frames_per_bit];
-
-              const int u = mix_entries[b].up;
-              const int d = mix_entries[b].down;
-              const int index = mix_entries[b].frame;
-
-              frame_mod[index][u] = data_bit ? FrameMod::UP : FrameMod::DOWN;
-              frame_mod[index][d] = data_bit ? FrameMod::DOWN : FrameMod::UP;
-            }
-        }
-    }
-  else
-    {
-      UpDownGen up_down_gen (Random::Stream::data_up_down);
-      // sync block always written in linear order (no mix)
-      for (int f = 0; f < frame_count; f++)
-        {
-          size_t index = data_frame_pos (f);
-
-          prepare_frame_mod (up_down_gen, f, frame_mod[index], bitvec[f / Params::frames_per_bit]);
-        }
-    }
-}
-
-size_t
-mark_sync_frame_count()
-{
-  return Params::sync_bits * Params::sync_frames_per_bit;
-}
-
-void
-mark_sync (vector<vector<FrameMod>>& frame_mod, int ab)
-{
-  const int frame_count = mark_sync_frame_count();
-  assert (frame_mod.size() >= mark_sync_frame_count());
-
-  UpDownGen up_down_gen (Random::Stream::sync_up_down);
-
-  // sync block always written in linear order (no mix)
-  for (int f = 0; f < frame_count; f++)
-    {
-      size_t index = sync_frame_pos (f);
-      int    data_bit = (f / Params::sync_frames_per_bit + ab) & 1; /* write 010101 for a block, 101010 for b block */
-
-      prepare_frame_mod (up_down_gen, f, frame_mod[index], data_bit);
-    }
-}
-
-void
-init_pad_mod_vec (vector<vector<FrameMod>>& pad_mod_vec)
-{
-  UpDownGen up_down_gen (Random::Stream::pad_up_down);
-
-  for (size_t f = 0; f < Params::frames_pad_start; f++)
-    {
-      vector<FrameMod> mod (Params::max_band + 1);
-
-      prepare_frame_mod (up_down_gen, f, mod, 0);
-      pad_mod_vec.push_back (mod);
-    }
-}
-
-void
-init_frame_mod_vec (vector<vector<FrameMod>>& frame_mod_vec, int ab, const vector<int>& bitvec)
-{
-  frame_mod_vec.resize (mark_sync_frame_count() + mark_data_frame_count());
-
-  for (auto& frame_mod : frame_mod_vec)
-    frame_mod.resize (Params::max_band + 1);
-
-  mark_sync (frame_mod_vec, ab);
-  mark_data (frame_mod_vec, bitvec);
-}
-
-template<class R>
-static void
-process_resampler (R& resampler, const vector<float>& in, vector<float>& out)
-{
-  resampler.out_count = out.size() / resampler.nchan();
-  resampler.out_data = &out[0];
-
-  /* avoid timeshift: zita needs k/2 - 1 samples before the actual input */
-  resampler.inp_count = resampler.inpsize () / 2 - 1;
-  resampler.inp_data  = nullptr;
-  resampler.process();
-
-  resampler.inp_count = in.size() / resampler.nchan();
-  resampler.inp_data = (float *) &in[0];
-  resampler.process();
-
-  /* zita needs k/2 samples after the actual input */
-  resampler.inp_count = resampler.inpsize() / 2;
-  resampler.inp_data  = nullptr;
-  resampler.process();
-}
-
-WavData
-resample (const WavData& wav_data, int rate)
-{
-  /* in our application, resampling should only be called if it is necessary
-   * since using the resampler with input rate == output rate would be slow
-   */
-  assert (rate != wav_data.sample_rate());
-
-  const int hlen = 16;
-  const double ratio = double (rate) / wav_data.sample_rate();
-
-  const vector<float>& in = wav_data.samples();
-  vector<float> out (lrint (in.size() / wav_data.n_channels() * ratio) * wav_data.n_channels());
-
-  /* zita-resampler provides two resampling algorithms
-   *
-   * a fast optimized version: Resampler
-   *   this is an optimized version, which works for many common cases,
-   *   like resampling between 22050, 32000, 44100, 48000, 96000 Hz
-   *
-   * a slower version: VResampler
-   *   this works for arbitary rates (like 33333 -> 44100 resampling)
-   *
-   * so we try using Resampler, and if that fails fall back to VResampler
-   */
-  Resampler resampler;
-  if (resampler.setup (wav_data.sample_rate(), rate, wav_data.n_channels(), hlen) == 0)
-    {
-      process_resampler (resampler, in, out);
-      return WavData (out, wav_data.n_channels(), rate, wav_data.bit_depth());
-    }
-
-  VResampler vresampler;
-  if (vresampler.setup (ratio, wav_data.n_channels(), hlen) == 0)
-    {
-      process_resampler (vresampler, in, out);
-      return WavData (out, wav_data.n_channels(), rate, wav_data.bit_depth());
-    }
-  fprintf (stderr, "audiowmark: resampling from rate %d to rate %d not supported.\n", wav_data.sample_rate(), rate);
-  exit (1);
-}
-
-/* synthesizes a watermark stream (overlap add with synthesis window)
- *
- * input:  per-channel fft delta values (always one frame)
- * output: samples
- */
-class WatermarkSynth
-{
-  const int     n_channels = 0;
-  vector<float> window;
-  vector<float> synth_samples;
-  bool          first_frame = true;
-
-  void
-  generate_window()
-  {
-    window.resize (Params::frame_size * 3);
-    for (size_t i = 0; i < window.size(); i++)
-      {
-        const double overlap = 0.1;
-
-        // triangular basic window
-        double tri;
-        double norm_pos = (double (i) - Params::frame_size) / Params::frame_size;
-
-        if (norm_pos > 0.5) /* symmetric window */
-          norm_pos = 1 - norm_pos;
-        if (norm_pos < -overlap)
-          {
-            tri = 0;
-          }
-        else if (norm_pos < overlap)
-          {
-            tri = 0.5 + norm_pos / (2 * overlap);
-          }
-        else
-          {
-            tri = 1;
-          }
-        // cosine
-        window[i] = (cos (tri*M_PI+M_PI)+1) * 0.5;
-      }
-  }
-public:
-  WatermarkSynth (int n_channels) :
-    n_channels (n_channels)
-  {
-    generate_window();
-    synth_samples.resize (window.size() * n_channels);
-  }
-  vector<float>
-  run (const vector<vector<complex<float>>>& fft_delta_spect)
-  {
-    const size_t synth_frame_sz = Params::frame_size * n_channels;
-    /* move frame 1 and frame 2 to frame 0 and frame 1 */
-    std::copy (&synth_samples[synth_frame_sz], &synth_samples[synth_frame_sz * 3], &synth_samples[0]);
-    /* zero out frame 2 */
-    std::fill (&synth_samples[synth_frame_sz * 2], &synth_samples[synth_frame_sz * 3], 0);
-    for (int ch = 0; ch < n_channels; ch++)
-      {
-        /* mix watermark signal to output frame */
-        vector<float> fft_delta_out = ifft (fft_delta_spect[ch]);
-
-        for (int dframe = 0; dframe <= 2; dframe++)
-          {
-            const int wstart = dframe * Params::frame_size;
-
-            int pos = dframe * Params::frame_size * n_channels + ch;
-            for (size_t x = 0; x < Params::frame_size; x++)
-              {
-                synth_samples[pos] += fft_delta_out[x] * window[wstart + x];
-                pos += n_channels;
-              }
-          }
-      }
-    if (first_frame)
-      {
-        first_frame = false;
-        return {};
-      }
-    else
-      {
-        vector<float> out_samples (synth_samples.begin(), synth_samples.begin() + Params::frame_size * n_channels);
-        return out_samples;
-      }
-  }
-};
-
-/* generates a watermark signal
- *
- * input:  original signal samples (always for one complete frame)
- * output: watermark signal (to be mixed to the original sample)
- */
-class WatermarkGen
-{
-  enum State { PAD, WATERMARK } state = State::PAD;
-
-  const int                 n_channels = 0;
-  int                       frame_number = 0;
-  int                       frame_bound = Params::frames_pad_start;
-  int                       ab = 0;
-
-  FFTAnalyzer               fft_analyzer;
-  WatermarkSynth            wm_synth;
-
-  vector<int>               bitvec_a;
-  vector<int>               bitvec_b;
-  vector<vector<FrameMod>>  pad_mod_vec;
-  vector<vector<FrameMod>>  frame_mod_vec_a;
-  vector<vector<FrameMod>>  frame_mod_vec_b;
-public:
-  WatermarkGen (int n_channels, const vector<int>& bitvec_a, const vector<int>& bitvec_b) :
-    n_channels (n_channels),
-    fft_analyzer (n_channels),
-    wm_synth (n_channels),
-    bitvec_a (bitvec_a),
-    bitvec_b (bitvec_b)
-  {
-    init_pad_mod_vec (pad_mod_vec);
-    init_frame_mod_vec (frame_mod_vec_a, 0, bitvec_a);
-  }
-  vector<float>
-  run (const vector<float>& samples)
-  {
-    assert (samples.size() == Params::frame_size * n_channels);
-
-    vector<vector<complex<float>>> fft_out = fft_analyzer.run_fft (samples, 0);
-
-    vector<vector<complex<float>>> fft_delta_spect;
-    for (int ch = 0; ch < n_channels; ch++)
-      fft_delta_spect.push_back (vector<complex<float>> (fft_out.back().size()));
-
-    if (state == State::PAD)
-      {
-        for (int ch = 0; ch < n_channels; ch++)
-          apply_frame_mod (pad_mod_vec[frame_number], fft_out[ch], fft_delta_spect[ch]);
-      }
-    else if (state == State::WATERMARK)
-      {
-        for (int ch = 0; ch < n_channels; ch++)
-          apply_frame_mod (ab ? frame_mod_vec_b[frame_number] : frame_mod_vec_a[frame_number], fft_out[ch], fft_delta_spect[ch]);
-      }
-
-    frame_number++;
-    if (frame_number == frame_bound)
-      {
-        frame_number = 0;
-
-        if (state == PAD)
-          {
-            state = WATERMARK;
-            frame_bound = mark_sync_frame_count() + mark_data_frame_count();
-          }
-        else if (state == WATERMARK)
-          {
-            ab = (ab + 1) & 1; // write A|B|A|B|...
-            frame_bound = mark_sync_frame_count() + mark_data_frame_count();
-
-            if (frame_mod_vec_b.empty())
-              {
-                // we initialize this only when we need it to minimize startup latency
-                init_frame_mod_vec (frame_mod_vec_b, 1, bitvec_b);
-              }
-          }
-      }
-    return wm_synth.run (fft_delta_spect);
-  }
-};
-
-class AudioBuffer
-{
-  const int     n_channels = 0;
-  vector<float> buffer;
-
-public:
-  AudioBuffer (int n_channels) :
-    n_channels (n_channels)
-  {
-  }
-  void
-  write_frames (const vector<float>& samples)
-  {
-    buffer.insert (buffer.end(), samples.begin(), samples.end());
-  }
-  vector<float>
-  read_frames (size_t frames)
-  {
-    assert (frames * n_channels <= buffer.size());
-    const auto begin = buffer.begin();
-    const auto end   = begin + frames * n_channels;
-    vector<float> result (begin, end);
-    buffer.erase (begin, end);
-    return result;
-  }
-  size_t
-  can_read_frames() const
-  {
-    return buffer.size() / n_channels;
-  }
-};
-
-class ResamplerImpl
-{
-public:
-  virtual
-  ~ResamplerImpl()
-  {
-  }
-
-  virtual void          write_frames (const vector<float>& frames) = 0;
-  virtual vector<float> read_frames (size_t frames) = 0;
-  virtual size_t        can_read_frames() const = 0;
-};
-
-template<class Resampler>
-class BufferedResamplerImpl : public ResamplerImpl
-{
-  const int     n_channels = 0;
-  bool          first_write = true;
-  Resampler     m_resampler;
-
-  vector<float> buffer;
-public:
-  BufferedResamplerImpl (int n_channels) :
-    n_channels (n_channels)
-  {
-  }
-  Resampler&
-  resampler()
-  {
-    return m_resampler;
-  }
-  void
-  write_frames (const vector<float>& frames)
-  {
-    if (first_write)
-      {
-        /* avoid timeshift: zita needs k/2 - 1 samples before the actual input */
-        m_resampler.inp_count = m_resampler.inpsize () / 2 - 1;
-        m_resampler.inp_data  = nullptr;
-
-        m_resampler.out_count = 1000000; // <- just needs to be large enough that all input is consumed
-        m_resampler.out_data  = nullptr;
-        m_resampler.process();
-
-        first_write = false;
-      }
-
-    uint start = 0;
-    do
-      {
-        const int out_count = Params::frame_size;
-        float out[out_count * n_channels];
-
-        m_resampler.out_count = out_count;
-        m_resampler.out_data  = out;
-
-        m_resampler.inp_count = frames.size() / n_channels - start;
-        m_resampler.inp_data  = const_cast<float *> (&frames[start * n_channels]);
-        m_resampler.process();
-
-        size_t count = out_count - m_resampler.out_count;
-        buffer.insert (buffer.end(), out, out + count * n_channels);
-
-        start += frames.size() / n_channels - start - m_resampler.inp_count;
-      }
-    while (start != frames.size() / n_channels);
-  }
-  vector<float>
-  read_frames (size_t frames)
-  {
-    assert (frames * n_channels <= buffer.size());
-    const auto begin = buffer.begin();
-    const auto end   = begin + frames * n_channels;
-    vector<float> result (begin, end);
-    buffer.erase (begin, end);
-    return result;
-  }
-  size_t
-  can_read_frames() const
-  {
-    return buffer.size() / n_channels;
-  }
-};
-
-ResamplerImpl *
-create_resampler (int n_channels, int old_rate, int new_rate)
-{
-  if (old_rate == new_rate)
-    {
-      return nullptr; // should not be using create_resampler for that case
-    }
-  else
-    {
-      /* zita-resampler provides two resampling algorithms
-       *
-       * a fast optimized version: Resampler
-       *   this is an optimized version, which works for many common cases,
-       *   like resampling between 22050, 32000, 44100, 48000, 96000 Hz
-       *
-       * a slower version: VResampler
-       *   this works for arbitary rates (like 33333 -> 44100 resampling)
-       *
-       * so we try using Resampler, and if that fails fall back to VResampler
-       */
-      const int hlen = 16;
-
-      auto resampler = new BufferedResamplerImpl<Resampler> (n_channels);
-      if (resampler->resampler().setup (old_rate, new_rate, n_channels, hlen) == 0)
-        {
-          return resampler;
-        }
-      else
-        delete resampler;
-
-      auto vresampler = new BufferedResamplerImpl<VResampler> (n_channels);
-      const double ratio = double (new_rate) / old_rate;
-      if (vresampler->resampler().setup (ratio, n_channels, hlen) == 0)
-        {
-          return vresampler;
-        }
-      else
-        {
-          error ("audiowmark: resampling from old_rate=%d to new_rate=%d not implemented\n", old_rate, new_rate);
-          delete vresampler;
-          return nullptr;
-        }
-    }
-}
-
-/* generate a watermark at Params::mark_sample_rate and resample to whatever the original signal has
- *
- * input:  samples from original signal (always one frame)
- * output: watermark signal resampled to original signal sample rate
- */
-class WatermarkResampler
-{
-  std::unique_ptr<ResamplerImpl> in_resampler;
-  std::unique_ptr<ResamplerImpl> out_resampler;
-  WatermarkGen                   wm_gen;
-  bool                           need_resampler = false;
-public:
-  WatermarkResampler (int n_channels, int input_rate, vector<int>& bitvec_a, vector<int>& bitvec_b) :
-    wm_gen (n_channels , bitvec_a, bitvec_b)
-  {
-    need_resampler = (input_rate != Params::mark_sample_rate);
-
-    if (need_resampler)
-      {
-        in_resampler.reset (create_resampler (n_channels, input_rate, Params::mark_sample_rate));
-        out_resampler.reset (create_resampler (n_channels, Params::mark_sample_rate, input_rate));
-      }
-  }
-  bool
-  init_ok()
-  {
-    if (need_resampler)
-      return (in_resampler && out_resampler);
-    else
-      return true;
-  }
-  vector<float>
-  run (const vector<float>& samples)
-  {
-    if (!need_resampler)
-      {
-        /* cheap case: if no resampling is necessary, just generate the watermark signal */
-        return wm_gen.run (samples);
-      }
-
-    /* resample to the watermark sample rate */
-    in_resampler->write_frames (samples);
-    while (in_resampler->can_read_frames() >= Params::frame_size)
-      {
-        vector<float> r_samples = in_resampler->read_frames (Params::frame_size);
-
-        /* generate watermark at normalized sample rate */
-        vector<float> wm_samples = wm_gen.run (r_samples);
-
-        /* resample back to the original sample rate of the audio file */
-        out_resampler->write_frames (wm_samples);
-      }
-
-    size_t to_read = out_resampler->can_read_frames();
-    return out_resampler->read_frames (to_read);
-  }
-};
-
-int
-add_watermark (const string& infile, const string& outfile, const string& bits)
-{
-  auto bitvec = bit_str_to_vec (bits);
-  if (bitvec.empty())
-    {
-      fprintf (stderr, "audiowmark: cannot parse bits %s\n", bits.c_str());
-      return 1;
-    }
-  if (bitvec.size() > Params::payload_size)
-    {
-      fprintf (stderr, "audiowmark: number of bits in message '%s' larger than payload size\n", bits.c_str());
-      return 1;
-    }
-  if (bitvec.size() < Params::payload_size)
-    {
-      /* expand message automatically; good for testing, maybe not so good for the final product */
-      vector<int> expanded_bitvec;
-      for (size_t i = 0; i < Params::payload_size; i++)
-        expanded_bitvec.push_back (bitvec[i % bitvec.size()]);
-      bitvec = expanded_bitvec;
-    }
-  info ("Input:        %s\n", infile.c_str());
-  info ("Output:       %s\n", outfile.c_str());
-  info ("Message:      %s\n", bit_vec_to_str (bitvec).c_str());
-  info ("Strength:     %.6g\n\n", Params::water_delta * 1000);
-
-  /* add forward error correction, bitvec will now be a lot larger */
-  auto bitvec_a = randomize_bit_order (conv_encode (ConvBlockType::a, bitvec), /* encode */ true);
-  auto bitvec_b = randomize_bit_order (conv_encode (ConvBlockType::b, bitvec), /* encode */ true);
-
-  std::unique_ptr<AudioInputStream> in_stream; // FIXME: virtual constructor
-  if (Params::input_format == Format::AUTO)
-    {
-      SFInputStream *sistream = new SFInputStream();
-      in_stream.reset (sistream);
-      Error err = sistream->open (infile);
-      if (err && mp3_detect (infile))
-        {
-          MP3InputStream *mistream = new MP3InputStream();
-          in_stream.reset (mistream);
-
-          err = mistream->open (infile);
-          if (err)
-            {
-              fprintf (stderr, "audiowmark: error opening mp3 %s: %s\n", infile.c_str(), err.message());
-              return 1;
-            }
-        }
-      else if (err)
-        {
-          fprintf (stderr, "audiowmark: error opening %s: %s\n", infile.c_str(), err.message());
-          return 1;
-        }
-    }
-  else
-    {
-      RawInputStream *ristream = new RawInputStream();
-      in_stream.reset (ristream);
-      Error err = ristream->open (infile, Params::raw_input_format);
-      if (err)
-        {
-          fprintf (stderr, "audiowmark: error opening %s: %s\n", infile.c_str(), err.message());
-          return 1;
-        }
-    }
-  if (in_stream->n_frames() == AudioInputStream::N_FRAMES_UNKNOWN)
-    {
-      info ("Time:         unknown\n");
-    }
-  else
-    {
-      int orig_seconds = in_stream->n_frames() / in_stream->sample_rate();
-      info ("Time:         %d:%02d\n", orig_seconds / 60, orig_seconds % 60);
-    }
-  info ("Sample Rate:  %d\n", in_stream->sample_rate());
-  info ("Channels:     %d\n", in_stream->n_channels());
-
-  const int out_bit_depth = in_stream->bit_depth() > 16 ? 24 : 16;
-  std::unique_ptr<AudioOutputStream> out_stream;
-  if (Params::output_format == Format::RAW)
-    {
-      RawOutputStream *rostream = new RawOutputStream();
-      out_stream.reset (rostream);
-      Error err = rostream->open (outfile, Params::raw_output_format);
-      if (err)
-        {
-          fprintf (stderr, "audiowmark: error opening %s: %s\n", outfile.c_str(), err.message());
-          return 1;
-        }
-    }
-  else if (outfile == "-")
-    {
-      StdoutWavOutputStream *swstream = new StdoutWavOutputStream();
-      out_stream.reset (swstream);
-      Error err = swstream->open (in_stream->n_channels(), in_stream->sample_rate(), out_bit_depth, in_stream->n_frames());
-      if (err)
-        {
-          error ("audiowmark: error writing to -: %s\n", err.message());
-          return 1;
-        }
-    }
-  else
-    {
-      SFOutputStream *sfostream = new SFOutputStream();
-      out_stream.reset (sfostream);
-      Error err = sfostream->open (outfile, in_stream->n_channels(), in_stream->sample_rate(), out_bit_depth, in_stream->n_frames());
-      if (err)
-        {
-          error ("audiowmark: error writing to %s: %s\n", outfile.c_str(), err.message());
-          return 1;
-        }
-    }
-  vector<float> samples;
-
-  const int n_channels = in_stream->n_channels();
-  AudioBuffer audio_buffer (n_channels);
-  WatermarkResampler wm_resampler (n_channels, in_stream->sample_rate(), bitvec_a, bitvec_b);
-  if (!wm_resampler.init_ok())
-    return 1;
-
-  size_t total_input_frames = 0;
-  size_t total_output_frames = 0;
-  while (true)
-    {
-      Error err = Error::Code::NONE;
-
-      err = in_stream->read_frames (samples, Params::frame_size);
-      if (err)
-        {
-          error ("audiowmark: input stream read failed: %s\n", err.message());
-          return 1;
-        }
-      total_input_frames += samples.size() / n_channels;
-
-      if (samples.size() < Params::frame_size * n_channels)
-        {
-          if (total_input_frames == total_output_frames)
-            break;
-
-          /* zero sample padding after the actual input */
-          samples.resize (Params::frame_size * n_channels);
-        }
-      audio_buffer.write_frames (samples);
-      samples = wm_resampler.run (samples);
-      size_t to_read = samples.size() / n_channels;
-      vector<float> orig_samples  = audio_buffer.read_frames (to_read);
-      assert (samples.size() == orig_samples.size());
-
-      for (size_t i = 0; i < samples.size(); i++)
-        samples[i] += orig_samples[i];
-
-      size_t max_write_frames = total_input_frames - total_output_frames;
-      if (samples.size() > max_write_frames * n_channels)
-        samples.resize (max_write_frames * n_channels);
-
-      err = out_stream->write_frames (samples);
-      if (err)
-        {
-          error ("audiowmark output write failed: %s\n", err.message());
-          return 1;
-        }
-      total_output_frames += samples.size() / n_channels;
-    }
-  fprintf (stderr, "total output: %zd, expected: %zd\n", total_output_frames, in_stream->n_frames());
-#if 0
-  if (Params::snr)
-    {
-      /* compute/show signal to noise ratio */
-      double delta_power = 0;
-      double signal_power = 0;
-      for (size_t i = 0; i < samples.size(); i++)
-        {
-          const double orig_scaled = samples[i];      // original sample
-          const double delta       = out_signal[i];   // watermark
-
-          delta_power += delta * delta;
-          signal_power += orig_scaled * orig_scaled;
-        }
-      delta_power /= samples.size();
-      signal_power /= samples.size();
-
-      printf ("SNR:          %f dB\n", 10 * log10 (signal_power / delta_power));
-    }
-  float max_value = 1e-6;
-  for (size_t i = 0; i < samples.size(); i++)
-    {
-      /* Typically the original samples are already in range [-1;1]. However in
-       * some cases (mp3 loader), the samples are not fully normalized; in those
-       * cases, for volume normalization we treat them as-if they had been
-       * clipped already; final clipping will be done while saving.
-       */
-      const float x = bound<float> (-1, samples[i], 1);
-      const float value = fabsf (x + out_signal[i]);
-      if (value > max_value)
-        max_value = value;
-    }
-
-  // scale (samples + watermark) down if necessary to avoid clipping
-  const float scale = min (1.0 / max_value, 1.0);
-  for (size_t i = 0; i < samples.size(); i++)
-    samples[i] = (samples[i] + out_signal[i]) * scale;
-
-  printf ("Data Blocks:  %d\n", data_blocks);
-  printf ("Volume Norm:  %.3f (%.2f dB)\n", scale, db_from_factor (scale, -96));
-#endif
-  return 0;
-}
-
-vector<float>
-normalize_soft_bits (const vector<float>& soft_bits)
-{
-  vector<float> norm_soft_bits;
-
-  /* soft decoding produces better error correction than hard decoding */
-  if (Params::hard)
-    {
-      for (auto value : soft_bits)
-        norm_soft_bits.push_back (value > 0 ? 1.0 : 0.0);
-    }
-  else
-    {
-      /* figure out average level of each bit */
-      double mean = 0;
-      for (auto value : soft_bits)
-        mean += fabs (value);
-      mean /= soft_bits.size();
-
-      /* rescale from [-mean,+mean] to [0.0,1.0] */
-      for (auto value : soft_bits)
-        norm_soft_bits.push_back (0.5 * (value / mean + 1));
-    }
-
-  return norm_soft_bits;
-}
-
-vector<float>
-mix_decode (vector<vector<complex<float>>>& fft_out, int n_channels)
-{
-  vector<float> raw_bit_vec;
-
-  const int frame_count = mark_data_frame_count();
-
-  vector<MixEntry> mix_entries = gen_mix_entries();
-
-  double umag = 0, dmag = 0;
-  for (int f = 0; f < frame_count; f++)
-    {
-      for (int ch = 0; ch < n_channels; ch++)
-        {
-          for (size_t frame_b = 0; frame_b < Params::bands_per_frame; frame_b++)
-            {
-              int b = f * Params::bands_per_frame + frame_b;
-              const double min_db = -96;
-
-              const size_t index = mix_entries[b].frame * n_channels + ch;
-              const int u = mix_entries[b].up;
-              const int d = mix_entries[b].down;
-
-              umag += db_from_factor (abs (fft_out[index][u]), min_db);
-              dmag += db_from_factor (abs (fft_out[index][d]), min_db);
-            }
-        }
-      if ((f % Params::frames_per_bit) == (Params::frames_per_bit - 1))
-        {
-          raw_bit_vec.push_back (umag - dmag);
-          umag = 0;
-          dmag = 0;
-        }
-    }
-  return raw_bit_vec;
-}
-
-vector<float>
-linear_decode (vector<vector<complex<float>>>& fft_out, int n_channels)
-{
-  UpDownGen     up_down_gen (Random::Stream::data_up_down);
-  vector<float> raw_bit_vec;
-
-  const int frame_count = mark_data_frame_count();
-
-  double umag = 0, dmag = 0;
-  for (int f = 0; f < frame_count; f++)
-    {
-      for (int ch = 0; ch < n_channels; ch++)
-        {
-          const size_t index = data_frame_pos (f) * n_channels + ch;
-          UpDownArray up, down;
-          up_down_gen.get (f, up, down);
-
-          const double min_db = -96;
-          for (auto u : up)
-            umag += db_from_factor (abs (fft_out[index][u]), min_db);
-
-          for (auto d : down)
-            dmag += db_from_factor (abs (fft_out[index][d]), min_db);
-        }
-      if ((f % Params::frames_per_bit) == (Params::frames_per_bit - 1))
-        {
-          raw_bit_vec.push_back (umag - dmag);
-          umag = 0;
-          dmag = 0;
-        }
-    }
-  return raw_bit_vec;
-}
-
-double
-normalize_sync_quality (double raw_quality)
-{
-  /* the quality for a good sync block depends on watermark strength
-   *
-   * this is just an approximation, but it should be good enough to be able to
-   * use one single threshold on the normalized value check if we have a sync
-   * block or not - typical output is 1.0 or more for sync blocks and close
-   * to 0.0 for non-sync blocks
-   */
-  return raw_quality / min (Params::water_delta, 0.080) / 2.9;
-}
-
-class SyncFinder
-{
-  vector<vector<int>> up;
-  vector<vector<int>> down;
-
-  void
-  init_up_down (const WavData& wav_data)
-  {
-    up.clear();
-    down.clear();
-
-    up.resize (Params::sync_bits);
-    down.resize (Params::sync_bits);
-
-    UpDownGen up_down_gen (Random::Stream::sync_up_down);
-    size_t n_bands = Params::max_band - Params::min_band + 1;
-    for (int bit = 0; bit < Params::sync_bits; bit++)
-      {
-        for (int f = 0; f < Params::sync_frames_per_bit; f++)
-          {
-            UpDownArray frame_up, frame_down;
-            up_down_gen.get (f + bit * Params::sync_frames_per_bit, frame_up, frame_down);
-
-            for (auto u : frame_up)
-              up[bit].push_back (u - Params::min_band + sync_frame_pos (f + bit * Params::sync_frames_per_bit) * n_bands * wav_data.n_channels());
-
-            for (auto d : frame_down)
-              down[bit].push_back (d - Params::min_band + sync_frame_pos (f + bit * Params::sync_frames_per_bit) * n_bands * wav_data.n_channels());
-          }
-        sort (up[bit].begin(), up[bit].end());
-        sort (down[bit].begin(), down[bit].end());
-      }
-  }
-  double
-  sync_decode (const WavData& wav_data, const size_t start_frame, const vector<float>& fft_out_db, ConvBlockType *block_type)
-  {
-    double sync_quality = 0;
-
-    size_t n_bands = Params::max_band - Params::min_band + 1;
-    for (int bit = 0; bit < Params::sync_bits; bit++)
-      {
-        float umag = 0, dmag = 0;
-
-        for (int ch = 0; ch < wav_data.n_channels(); ch++)
-          {
-            const int index = (start_frame * wav_data.n_channels() + ch) * n_bands;
-
-            for (size_t i = 0; i < up[bit].size(); i++)
-              {
-                umag += fft_out_db[index + up[bit][i]];
-                dmag += fft_out_db[index + down[bit][i]];
-              }
-          }
-        /* convert avoiding bias, raw_bit < 0 => 0 bit received; raw_bit > 0 => 1 bit received */
-        double raw_bit;
-        if (umag < dmag)
-          {
-            raw_bit = 1 - umag / dmag;
-          }
-        else
-          {
-            raw_bit = dmag / umag - 1;
-          }
-
-        const int expect_data_bit = bit & 1; /* expect 010101 */
-        const double q = expect_data_bit ? raw_bit : -raw_bit;
-        sync_quality += q;
-      }
-    sync_quality /= Params::sync_bits;
-    sync_quality = normalize_sync_quality (sync_quality);
-
-    if (sync_quality < 0)
-      {
-        *block_type = ConvBlockType::b;
-        return -sync_quality;
-      }
-    else
-      {
-        *block_type = ConvBlockType::a;
-        return sync_quality;
-      }
-  }
-public:
-  struct Score {
-    size_t        index;
-    double        quality;
-    ConvBlockType block_type;
-  };
-  vector<Score>
-  search (const WavData& wav_data)
-  {
-    vector<Score> result_scores;
-    vector<Score> sync_scores;
-
-    if (Params::test_no_sync)
-      {
-        const size_t expect0 = Params::frames_pad_start * Params::frame_size;
-        const size_t expect_step = (mark_sync_frame_count() + mark_data_frame_count()) * Params::frame_size;
-        const size_t expect_end = frame_count (wav_data) * Params::frame_size;
-
-        int ab = 0;
-        for (size_t expect_index = expect0; expect_index + expect_step < expect_end; expect_index += expect_step)
-          result_scores.push_back (Score { expect_index, 1.0, (ab++ & 1) ? ConvBlockType::b : ConvBlockType::a });
-
-        return result_scores;
-      }
-    init_up_down (wav_data);
-
-    vector<float> fft_db;
-
-    // compute multiple time-shifted fft vectors
-    size_t n_bands = Params::max_band - Params::min_band + 1;
-    for (size_t sync_shift = 0; sync_shift < Params::frame_size; sync_shift += Params::sync_search_step)
-      {
-        sync_fft (wav_data, sync_shift, frame_count (wav_data) - 1, fft_db, /* want all frames */ {});
-        for (int start_frame = 0; start_frame < frame_count (wav_data); start_frame++)
-          {
-            const size_t sync_index = start_frame * Params::frame_size + sync_shift;
-            if ((start_frame + mark_sync_frame_count() + mark_data_frame_count()) * wav_data.n_channels() * n_bands < fft_db.size())
-              {
-                ConvBlockType block_type;
-                double quality = sync_decode (wav_data, start_frame, fft_db, &block_type);
-                // printf ("%zd %f\n", sync_index, quality);
-                sync_scores.emplace_back (Score { sync_index, quality, block_type });
-              }
-          }
-      }
-    sort (sync_scores.begin(), sync_scores.end(), [] (const Score& a, const Score &b) { return a.index < b.index; });
-
-    vector<int> want_frames (mark_sync_frame_count() + mark_data_frame_count());
-    for (size_t f = 0; f < mark_sync_frame_count(); f++)
-      want_frames[sync_frame_pos (f)] = 1;
-
-    /* for strength 8 and above:
-     *   -> more false positive candidates are rejected, so we can use a lower threshold
-     *
-     * for strength 7 and below:
-     *   -> we need a higher threshold, because otherwise watermark detection takes too long
-     */
-    const double strength = Params::water_delta * 1000;
-    const double sync_threshold1 = strength > 7.5 ? 0.4 : 0.5;
-
-    for (size_t i = 0; i < sync_scores.size(); i++)
-      {
-        // printf ("%zd %f\n", sync_scores[i].index, sync_scores[i].quality);
-        if (sync_scores[i].quality > sync_threshold1)
-          {
-            double q_last = -1;
-            double q_next = -1;
-
-            if (i > 0)
-              q_last = sync_scores[i - 1].quality;
-
-            if (i + 1 < sync_scores.size())
-              q_next = sync_scores[i + 1].quality;
-
-            if (sync_scores[i].quality > q_last && sync_scores[i].quality > q_next)
-              {
-                //printf ("%zd %s %f", sync_scores[i].index, find_closest_sync (sync_scores[i].index), sync_scores[i].quality);
-
-                // refine match
-                double best_quality       = sync_scores[i].quality;
-                size_t best_index         = sync_scores[i].index;
-                ConvBlockType best_block_type = sync_scores[i].block_type; /* doesn't really change during refinement */
-
-                int start = std::max (int (sync_scores[i].index) - Params::sync_search_step, 0);
-                int end   = sync_scores[i].index + Params::sync_search_step;
-                for (int fine_index = start; fine_index <= end; fine_index += Params::sync_search_fine)
-                  {
-                    sync_fft (wav_data, fine_index, mark_sync_frame_count() + mark_data_frame_count(), fft_db, want_frames);
-                    if (fft_db.size())
-                      {
-                        ConvBlockType block_type;
-                        double        q = sync_decode (wav_data, 0, fft_db, &block_type);
-
-                        if (q > best_quality)
-                          {
-                            best_quality = q;
-                            best_index   = fine_index;
-                          }
-                      }
-                  }
-                //printf (" => refined: %zd %s %f\n", best_index, find_closest_sync (best_index), best_quality);
-                if (best_quality > Params::sync_threshold2)
-                  result_scores.push_back (Score { best_index, best_quality, best_block_type });
-              }
-          }
-      }
-    return result_scores;
-  }
-private:
-  void
-  sync_fft (const WavData& wav_data, size_t index, size_t frame_count, vector<float>& fft_out_db, const vector<int>& want_frames)
-  {
-    fft_out_db.clear();
-
-    /* read past end? -> fail */
-    if (wav_data.n_values() < (index + frame_count * Params::frame_size) * wav_data.n_channels())
-      return;
-
-    FFTAnalyzer fft_analyzer (wav_data.n_channels());
-    const vector<float>& samples = wav_data.samples();
-    const size_t n_bands = Params::max_band - Params::min_band + 1;
-    int out_pos = 0;
-
-    fft_out_db.resize (wav_data.n_channels() * n_bands * frame_count);
-
-    for (size_t f = 0; f < frame_count; f++)
-      {
-        const double min_db = -96;
-        if (want_frames.size() && !want_frames[f])
-          {
-            for (int ch = 0; ch < wav_data.n_channels(); ch++)
-              for (int i = Params::min_band; i <= Params::max_band; i++)
-                fft_out_db[out_pos++] = min_db;
-          }
-        else
-          {
-            vector<vector<complex<float>>> frame_result = fft_analyzer.run_fft (samples, index + f * Params::frame_size);
-
-            /* computing db-magnitude is expensive, so we better do it here */
-            for (int ch = 0; ch < wav_data.n_channels(); ch++)
-              for (int i = Params::min_band; i <= Params::max_band; i++)
-                fft_out_db[out_pos++] = db_from_factor (abs (frame_result[ch][i]), min_db);
-          }
-      }
-  }
-
-  const char*
-  find_closest_sync (size_t index)
-  {
-    int best_error = 0xffff;
-    int best = 0;
-
-    for (int i = 0; i < 100; i++)
-      {
-        int error = abs (int (index) - int (i * Params::sync_bits * Params::sync_frames_per_bit * Params::frame_size));
-        if (error < best_error)
-          {
-            best = i;
-            best_error = error;
-          }
-      }
-    static char buffer[1024]; // this code is for debugging only, so this should be ok
-    sprintf (buffer, "n:%d offset:%d", best, int (index) - int (best * Params::sync_bits * Params::sync_frames_per_bit * Params::frame_size));
-    return buffer;
-  }
-};
-
-int
-decode_and_report (const WavData& wav_data, const string& orig_pattern)
-{
-  int match_count = 0, total_count = 0, sync_match = 0;
-
-  SyncFinder                sync_finder;
-  vector<SyncFinder::Score> sync_scores = sync_finder.search (wav_data);
-
-  auto report_pattern = [&] (SyncFinder::Score sync_score, const vector<int>& bit_vec, float decode_error)
-  {
-    if (sync_score.index)
-      {
-        const char *block_str = nullptr;
-
-        switch (sync_score.block_type)
-          {
-            case ConvBlockType::a:  block_str = "A";
-                                    break;
-            case ConvBlockType::b:  block_str = "B";
-                                    break;
-            case ConvBlockType::ab: block_str = "AB";
-                                    break;
-          }
-        const int seconds = sync_score.index / wav_data.sample_rate();
-        printf ("pattern %2d:%02d %s %.3f %.3f %s\n", seconds / 60, seconds % 60, bit_vec_to_str (bit_vec).c_str(),
-                sync_score.quality, decode_error, block_str);
-      }
-    else /* this is the combined pattern "all" */
-      {
-        printf ("pattern   all %s %.3f %.3f\n", bit_vec_to_str (bit_vec).c_str(), sync_score.quality, decode_error);
-      }
-    if (!orig_pattern.empty())
-      {
-        bool        match = true;
-        vector<int> orig_vec = bit_str_to_vec (orig_pattern);
-
-        for (size_t i = 0; i < bit_vec.size(); i++)
-          match = match && (bit_vec[i] == orig_vec[i % orig_vec.size()]);
-
-        if (match)
-          match_count++;
-
-      }
-    total_count++;
-  };
-
-  vector<float> raw_bit_vec_all (conv_code_size (ConvBlockType::ab, Params::payload_size));
-  vector<int>   raw_bit_vec_norm (2);
-
-  SyncFinder::Score score_all { 0, 0 };
-  SyncFinder::Score score_ab  { 0, 0, ConvBlockType::ab };
-
-  ConvBlockType last_block_type = ConvBlockType::b;
-  vector<vector<float>> ab_raw_bit_vec (2);
-  vector<float>         ab_quality (2);
-  FFTAnalyzer           fft_analyzer (wav_data.n_channels());
-  for (auto sync_score : sync_scores)
-    {
-      const size_t count = mark_sync_frame_count() + mark_data_frame_count();
-      const size_t index = sync_score.index;
-      const int    ab = (sync_score.block_type == ConvBlockType::b); /* A -> 0, B -> 1 */
-
-      auto fft_range_out = fft_analyzer.fft_range (wav_data.samples(), index, count);
-      if (fft_range_out.size())
-        {
-          /* ---- retrieve bits from watermark ---- */
-          vector<float> raw_bit_vec;
-          if (Params::mix)
-            {
-              raw_bit_vec = mix_decode (fft_range_out, wav_data.n_channels());
-            }
-          else
-            {
-              raw_bit_vec = linear_decode (fft_range_out, wav_data.n_channels());
-            }
-          assert (raw_bit_vec.size() == conv_code_size (ConvBlockType::a, Params::payload_size));
-
-          raw_bit_vec = randomize_bit_order (raw_bit_vec, /* encode */ false);
-
-          /* ---- deal with this pattern ---- */
-          float decode_error = 0;
-          vector<int> bit_vec = conv_decode_soft (sync_score.block_type, normalize_soft_bits (raw_bit_vec), &decode_error);
-
-          report_pattern (sync_score, bit_vec, decode_error);
-
-          /* ---- update "all" pattern ---- */
-          score_all.quality += sync_score.quality;
-
-          for (size_t i = 0; i < raw_bit_vec.size(); i++)
-            {
-              raw_bit_vec_all[i * 2 + ab] += raw_bit_vec[i];
-            }
-          raw_bit_vec_norm[ab]++;
-
-          /* ---- if last block was A & this block is B => deal with combined AB block */
-          ab_raw_bit_vec[ab] = raw_bit_vec;
-          ab_quality[ab]     = sync_score.quality;
-          if (last_block_type == ConvBlockType::a && sync_score.block_type == ConvBlockType::b)
-            {
-              /* join A and B block -> AB block */
-              vector<float> ab_bits (raw_bit_vec.size() * 2);
-              for (size_t i = 0; i <  raw_bit_vec.size(); i++)
-                {
-                  ab_bits[i * 2] = ab_raw_bit_vec[0][i];
-                  ab_bits[i * 2 + 1] = ab_raw_bit_vec[1][i];
-                }
-              vector<int> bit_vec = conv_decode_soft (ConvBlockType::ab, normalize_soft_bits (ab_bits), &decode_error);
-              score_ab.index = sync_score.index;
-              score_ab.quality = (ab_quality[0] + ab_quality[1]) / 2;
-              report_pattern (score_ab, bit_vec, decode_error);
-            }
-          last_block_type = sync_score.block_type;
-        }
-    }
-  if (total_count > 1) /* all pattern: average soft bits of all watermarks and decode */
-    {
-      for (size_t i = 0; i < raw_bit_vec_all.size(); i += 2)
-        {
-          raw_bit_vec_all[i]     /= max (raw_bit_vec_norm[0], 1); /* normalize A soft bits with number of A blocks */
-          raw_bit_vec_all[i + 1] /= max (raw_bit_vec_norm[1], 1); /* normalize B soft bits with number of B blocks */
-        }
-      score_all.quality /= raw_bit_vec_norm[0] + raw_bit_vec_norm[1];
-
-      vector<float> soft_bit_vec = normalize_soft_bits (raw_bit_vec_all);
-
-      float decode_error = 0;
-      vector<int> bit_vec = conv_decode_soft (ConvBlockType::ab, soft_bit_vec, &decode_error);
-
-      report_pattern (score_all, bit_vec, decode_error);
-    }
-
-  if (!orig_pattern.empty())
-    {
-      printf ("match_count %d %d\n", match_count, total_count);
-
-      /* search sync markers at typical positions */
-      const int expect0 = Params::frames_pad_start * Params::frame_size;
-      const int expect_step = (mark_sync_frame_count() + mark_data_frame_count()) * Params::frame_size;
-      const int expect_end = frame_count (wav_data) * Params::frame_size;
-
-      for (int expect_index = expect0; expect_index + expect_step < expect_end; expect_index += expect_step)
-        {
-          for (auto sync_score : sync_scores)
-            {
-              if (abs (int (sync_score.index + Params::test_cut) - expect_index) < Params::frame_size / 2)
-                {
-                  sync_match++;
-                  break;
-                }
-            }
-        }
-      printf ("sync_match %d %zd\n", sync_match, sync_scores.size());
-    }
-  return 0;
-}
-
-int
-get_watermark (const string& infile, const string& orig_pattern)
-{
-  WavData wav_data;
-  Error err = wav_data.load (infile);
-  if (err)
-    {
-      fprintf (stderr, "audiowmark: error loading %s: %s\n", infile.c_str(), err.message());
-      return 1;
-    }
-
-  if (Params::test_truncate)
-    {
-      const size_t  want_n_samples = wav_data.sample_rate() * wav_data.n_channels() * Params::test_truncate;
-      vector<float> short_samples  = wav_data.samples();
-
-      if (want_n_samples < short_samples.size())
-        {
-          short_samples.resize (want_n_samples);
-          wav_data.set_samples (short_samples);
-        }
-    }
-  if (wav_data.sample_rate() == Params::mark_sample_rate)
-    {
-      return decode_and_report (wav_data, orig_pattern);
-    }
-  else
-    {
-      return decode_and_report (resample (wav_data, Params::mark_sample_rate), orig_pattern);
-    }
-}
-
 int
 gentest (const string& infile, const string& outfile)
 {

--- a/src/wmadd.cc
+++ b/src/wmadd.cc
+#include <stdint.h>
+
+#include <zita-resampler/resampler.h>
+#include <zita-resampler/vresampler.h>
+
+#include "wmcommon.hh"
+#include "fft.hh"
+#include "convcode.hh"
+#include "sfinputstream.hh"
+#include "sfoutputstream.hh"
+#include "mp3inputstream.hh"
+#include "rawinputstream.hh"
+#include "rawoutputstream.hh"
+#include "stdoutwavoutputstream.hh"
+#include "mp3.hh"
+
+using std::string;
+using std::vector;
+using std::complex;
+
+enum class FrameMod : uint8_t {
+  KEEP = 0,
+  UP,
+  DOWN
+};
+
+static void
+prepare_frame_mod (UpDownGen& up_down_gen, int f, vector<FrameMod>& frame_mod, int data_bit)
+{
+  UpDownArray up, down;
+  up_down_gen.get (f, up, down);
+  for (auto u : up)
+    frame_mod[u] = data_bit ? FrameMod::UP : FrameMod::DOWN;
+
+  for (auto d : down)
+    frame_mod[d] = data_bit ? FrameMod::DOWN : FrameMod::UP;
+}
+
+static void
+apply_frame_mod (const vector<FrameMod>& frame_mod, const vector<complex<float>>& fft_out, vector<complex<float>>& fft_delta_spect)
+{
+  const float   min_mag = 1e-7;   // avoid computing pow (0.0, -water_delta) which would be inf
+  for (size_t i = 0; i < frame_mod.size(); i++)
+    {
+      if (frame_mod[i] == FrameMod::KEEP)
+        continue;
+
+      int data_bit_sign = (frame_mod[i] == FrameMod::UP) ? 1 : -1;
+      /*
+       * for up bands, we want do use [for a 1 bit]  (pow (mag, 1 - water_delta))
+       *
+       * this actually increases the amount of energy because mag is less than 1.0
+       */
+      const float mag = abs (fft_out[i]);
+      if (mag > min_mag)
+        {
+          const float mag_factor = powf (mag, -Params::water_delta * data_bit_sign);
+
+          fft_delta_spect[i] = fft_out[i] * (mag_factor - 1);
+        }
+    }
+}
+
+static void
+mark_data (vector<vector<FrameMod>>& frame_mod, const vector<int>& bitvec)
+{
+  assert (bitvec.size() == mark_data_frame_count() / Params::frames_per_bit);
+  assert (frame_mod.size() >= mark_data_frame_count());
+
+  const int frame_count = mark_data_frame_count();
+
+  if (Params::mix)
+    {
+      vector<MixEntry> mix_entries = gen_mix_entries();
+
+      for (int f = 0; f < frame_count; f++)
+        {
+          for (size_t frame_b = 0; frame_b < Params::bands_per_frame; frame_b++)
+            {
+              int b = f * Params::bands_per_frame + frame_b;
+
+              const int data_bit = bitvec[f / Params::frames_per_bit];
+
+              const int u = mix_entries[b].up;
+              const int d = mix_entries[b].down;
+              const int index = mix_entries[b].frame;
+
+              frame_mod[index][u] = data_bit ? FrameMod::UP : FrameMod::DOWN;
+              frame_mod[index][d] = data_bit ? FrameMod::DOWN : FrameMod::UP;
+            }
+        }
+    }
+  else
+    {
+      UpDownGen up_down_gen (Random::Stream::data_up_down);
+
+      for (int f = 0; f < frame_count; f++)
+        {
+          size_t index = data_frame_pos (f);
+
+          prepare_frame_mod (up_down_gen, f, frame_mod[index], bitvec[f / Params::frames_per_bit]);
+        }
+    }
+}
+
+static void
+mark_sync (vector<vector<FrameMod>>& frame_mod, int ab)
+{
+  const int frame_count = mark_sync_frame_count();
+  assert (frame_mod.size() >= mark_sync_frame_count());
+
+  UpDownGen up_down_gen (Random::Stream::sync_up_down);
+
+  // sync block always written in linear order (no mix)
+  for (int f = 0; f < frame_count; f++)
+    {
+      size_t index = sync_frame_pos (f);
+      int    data_bit = (f / Params::sync_frames_per_bit + ab) & 1; /* write 010101 for a block, 101010 for b block */
+
+      prepare_frame_mod (up_down_gen, f, frame_mod[index], data_bit);
+    }
+}
+
+static void
+init_pad_mod_vec (vector<vector<FrameMod>>& pad_mod_vec)
+{
+  UpDownGen up_down_gen (Random::Stream::pad_up_down);
+
+  for (size_t f = 0; f < Params::frames_pad_start; f++)
+    {
+      vector<FrameMod> mod (Params::max_band + 1);
+
+      prepare_frame_mod (up_down_gen, f, mod, 0);
+      pad_mod_vec.push_back (mod);
+    }
+}
+
+static void
+init_frame_mod_vec (vector<vector<FrameMod>>& frame_mod_vec, int ab, const vector<int>& bitvec)
+{
+  frame_mod_vec.resize (mark_sync_frame_count() + mark_data_frame_count());
+
+  for (auto& frame_mod : frame_mod_vec)
+    frame_mod.resize (Params::max_band + 1);
+
+  mark_sync (frame_mod_vec, ab);
+  mark_data (frame_mod_vec, bitvec);
+}
+
+/* synthesizes a watermark stream (overlap add with synthesis window)
+ *
+ * input:  per-channel fft delta values (always one frame)
+ * output: samples
+ */
+class WatermarkSynth
+{
+  const int     n_channels = 0;
+  vector<float> window;
+  vector<float> synth_samples;
+  bool          first_frame = true;
+
+  void
+  generate_window()
+  {
+    window.resize (Params::frame_size * 3);
+    for (size_t i = 0; i < window.size(); i++)
+      {
+        const double overlap = 0.1;
+
+        // triangular basic window
+        double tri;
+        double norm_pos = (double (i) - Params::frame_size) / Params::frame_size;
+
+        if (norm_pos > 0.5) /* symmetric window */
+          norm_pos = 1 - norm_pos;
+        if (norm_pos < -overlap)
+          {
+            tri = 0;
+          }
+        else if (norm_pos < overlap)
+          {
+            tri = 0.5 + norm_pos / (2 * overlap);
+          }
+        else
+          {
+            tri = 1;
+          }
+        // cosine
+        window[i] = (cos (tri*M_PI+M_PI)+1) * 0.5;
+      }
+  }
+public:
+  WatermarkSynth (int n_channels) :
+    n_channels (n_channels)
+  {
+    generate_window();
+    synth_samples.resize (window.size() * n_channels);
+  }
+  vector<float>
+  run (const vector<vector<complex<float>>>& fft_delta_spect)
+  {
+    const size_t synth_frame_sz = Params::frame_size * n_channels;
+    /* move frame 1 and frame 2 to frame 0 and frame 1 */
+    std::copy (&synth_samples[synth_frame_sz], &synth_samples[synth_frame_sz * 3], &synth_samples[0]);
+    /* zero out frame 2 */
+    std::fill (&synth_samples[synth_frame_sz * 2], &synth_samples[synth_frame_sz * 3], 0);
+    for (int ch = 0; ch < n_channels; ch++)
+      {
+        /* mix watermark signal to output frame */
+        vector<float> fft_delta_out = ifft (fft_delta_spect[ch]);
+
+        for (int dframe = 0; dframe <= 2; dframe++)
+          {
+            const int wstart = dframe * Params::frame_size;
+
+            int pos = dframe * Params::frame_size * n_channels + ch;
+            for (size_t x = 0; x < Params::frame_size; x++)
+              {
+                synth_samples[pos] += fft_delta_out[x] * window[wstart + x];
+                pos += n_channels;
+              }
+          }
+      }
+    if (first_frame)
+      {
+        first_frame = false;
+        return {};
+      }
+    else
+      {
+        vector<float> out_samples (synth_samples.begin(), synth_samples.begin() + Params::frame_size * n_channels);
+        return out_samples;
+      }
+  }
+};
+
+/* generates a watermark signal
+ *
+ * input:  original signal samples (always for one complete frame)
+ * output: watermark signal (to be mixed to the original sample)
+ */
+class WatermarkGen
+{
+  enum State { PAD, WATERMARK } state = State::PAD;
+
+  const int                 n_channels = 0;
+  int                       frame_number = 0;
+  int                       frame_bound = Params::frames_pad_start;
+  int                       ab = 0;
+
+  FFTAnalyzer               fft_analyzer;
+  WatermarkSynth            wm_synth;
+
+  vector<int>               bitvec_a;
+  vector<int>               bitvec_b;
+  vector<vector<FrameMod>>  pad_mod_vec;
+  vector<vector<FrameMod>>  frame_mod_vec_a;
+  vector<vector<FrameMod>>  frame_mod_vec_b;
+public:
+  WatermarkGen (int n_channels, const vector<int>& bitvec_a, const vector<int>& bitvec_b) :
+    n_channels (n_channels),
+    fft_analyzer (n_channels),
+    wm_synth (n_channels),
+    bitvec_a (bitvec_a),
+    bitvec_b (bitvec_b)
+  {
+    init_pad_mod_vec (pad_mod_vec);
+    init_frame_mod_vec (frame_mod_vec_a, 0, bitvec_a);
+  }
+  vector<float>
+  run (const vector<float>& samples)
+  {
+    assert (samples.size() == Params::frame_size * n_channels);
+
+    vector<vector<complex<float>>> fft_out = fft_analyzer.run_fft (samples, 0);
+
+    vector<vector<complex<float>>> fft_delta_spect;
+    for (int ch = 0; ch < n_channels; ch++)
+      fft_delta_spect.push_back (vector<complex<float>> (fft_out.back().size()));
+
+    if (state == State::PAD)
+      {
+        for (int ch = 0; ch < n_channels; ch++)
+          apply_frame_mod (pad_mod_vec[frame_number], fft_out[ch], fft_delta_spect[ch]);
+      }
+    else if (state == State::WATERMARK)
+      {
+        for (int ch = 0; ch < n_channels; ch++)
+          apply_frame_mod (ab ? frame_mod_vec_b[frame_number] : frame_mod_vec_a[frame_number], fft_out[ch], fft_delta_spect[ch]);
+      }
+
+    frame_number++;
+    if (frame_number == frame_bound)
+      {
+        frame_number = 0;
+
+        if (state == PAD)
+          {
+            state = WATERMARK;
+            frame_bound = mark_sync_frame_count() + mark_data_frame_count();
+          }
+        else if (state == WATERMARK)
+          {
+            ab = (ab + 1) & 1; // write A|B|A|B|...
+            frame_bound = mark_sync_frame_count() + mark_data_frame_count();
+
+            if (frame_mod_vec_b.empty())
+              {
+                // we initialize this only when we need it to minimize startup latency
+                init_frame_mod_vec (frame_mod_vec_b, 1, bitvec_b);
+              }
+          }
+      }
+    return wm_synth.run (fft_delta_spect);
+  }
+};
+
+class AudioBuffer
+{
+  const int     n_channels = 0;
+  vector<float> buffer;
+
+public:
+  AudioBuffer (int n_channels) :
+    n_channels (n_channels)
+  {
+  }
+  void
+  write_frames (const vector<float>& samples)
+  {
+    buffer.insert (buffer.end(), samples.begin(), samples.end());
+  }
+  vector<float>
+  read_frames (size_t frames)
+  {
+    assert (frames * n_channels <= buffer.size());
+    const auto begin = buffer.begin();
+    const auto end   = begin + frames * n_channels;
+    vector<float> result (begin, end);
+    buffer.erase (begin, end);
+    return result;
+  }
+  size_t
+  can_read_frames() const
+  {
+    return buffer.size() / n_channels;
+  }
+};
+
+class ResamplerImpl
+{
+public:
+  virtual
+  ~ResamplerImpl()
+  {
+  }
+
+  virtual void          write_frames (const vector<float>& frames) = 0;
+  virtual vector<float> read_frames (size_t frames) = 0;
+  virtual size_t        can_read_frames() const = 0;
+};
+
+template<class Resampler>
+class BufferedResamplerImpl : public ResamplerImpl
+{
+  const int     n_channels = 0;
+  bool          first_write = true;
+  Resampler     m_resampler;
+
+  vector<float> buffer;
+public:
+  BufferedResamplerImpl (int n_channels) :
+    n_channels (n_channels)
+  {
+  }
+  Resampler&
+  resampler()
+  {
+    return m_resampler;
+  }
+  void
+  write_frames (const vector<float>& frames)
+  {
+    if (first_write)
+      {
+        /* avoid timeshift: zita needs k/2 - 1 samples before the actual input */
+        m_resampler.inp_count = m_resampler.inpsize () / 2 - 1;
+        m_resampler.inp_data  = nullptr;
+
+        m_resampler.out_count = 1000000; // <- just needs to be large enough that all input is consumed
+        m_resampler.out_data  = nullptr;
+        m_resampler.process();
+
+        first_write = false;
+      }
+
+    uint start = 0;
+    do
+      {
+        const int out_count = Params::frame_size;
+        float out[out_count * n_channels];
+
+        m_resampler.out_count = out_count;
+        m_resampler.out_data  = out;
+
+        m_resampler.inp_count = frames.size() / n_channels - start;
+        m_resampler.inp_data  = const_cast<float *> (&frames[start * n_channels]);
+        m_resampler.process();
+
+        size_t count = out_count - m_resampler.out_count;
+        buffer.insert (buffer.end(), out, out + count * n_channels);
+
+        start += frames.size() / n_channels - start - m_resampler.inp_count;
+      }
+    while (start != frames.size() / n_channels);
+  }
+  vector<float>
+  read_frames (size_t frames)
+  {
+    assert (frames * n_channels <= buffer.size());
+    const auto begin = buffer.begin();
+    const auto end   = begin + frames * n_channels;
+    vector<float> result (begin, end);
+    buffer.erase (begin, end);
+    return result;
+  }
+  size_t
+  can_read_frames() const
+  {
+    return buffer.size() / n_channels;
+  }
+};
+
+static ResamplerImpl *
+create_resampler (int n_channels, int old_rate, int new_rate)
+{
+  if (old_rate == new_rate)
+    {
+      return nullptr; // should not be using create_resampler for that case
+    }
+  else
+    {
+      /* zita-resampler provides two resampling algorithms
+       *
+       * a fast optimized version: Resampler
+       *   this is an optimized version, which works for many common cases,
+       *   like resampling between 22050, 32000, 44100, 48000, 96000 Hz
+       *
+       * a slower version: VResampler
+       *   this works for arbitary rates (like 33333 -> 44100 resampling)
+       *
+       * so we try using Resampler, and if that fails fall back to VResampler
+       */
+      const int hlen = 16;
+
+      auto resampler = new BufferedResamplerImpl<Resampler> (n_channels);
+      if (resampler->resampler().setup (old_rate, new_rate, n_channels, hlen) == 0)
+        {
+          return resampler;
+        }
+      else
+        delete resampler;
+
+      auto vresampler = new BufferedResamplerImpl<VResampler> (n_channels);
+      const double ratio = double (new_rate) / old_rate;
+      if (vresampler->resampler().setup (ratio, n_channels, hlen) == 0)
+        {
+          return vresampler;
+        }
+      else
+        {
+          error ("audiowmark: resampling from old_rate=%d to new_rate=%d not implemented\n", old_rate, new_rate);
+          delete vresampler;
+          return nullptr;
+        }
+    }
+}
+
+/* generate a watermark at Params::mark_sample_rate and resample to whatever the original signal has
+ *
+ * input:  samples from original signal (always one frame)
+ * output: watermark signal resampled to original signal sample rate
+ */
+class WatermarkResampler
+{
+  std::unique_ptr<ResamplerImpl> in_resampler;
+  std::unique_ptr<ResamplerImpl> out_resampler;
+  WatermarkGen                   wm_gen;
+  bool                           need_resampler = false;
+public:
+  WatermarkResampler (int n_channels, int input_rate, vector<int>& bitvec_a, vector<int>& bitvec_b) :
+    wm_gen (n_channels , bitvec_a, bitvec_b)
+  {
+    need_resampler = (input_rate != Params::mark_sample_rate);
+
+    if (need_resampler)
+      {
+        in_resampler.reset (create_resampler (n_channels, input_rate, Params::mark_sample_rate));
+        out_resampler.reset (create_resampler (n_channels, Params::mark_sample_rate, input_rate));
+      }
+  }
+  bool
+  init_ok()
+  {
+    if (need_resampler)
+      return (in_resampler && out_resampler);
+    else
+      return true;
+  }
+  vector<float>
+  run (const vector<float>& samples)
+  {
+    if (!need_resampler)
+      {
+        /* cheap case: if no resampling is necessary, just generate the watermark signal */
+        return wm_gen.run (samples);
+      }
+
+    /* resample to the watermark sample rate */
+    in_resampler->write_frames (samples);
+    while (in_resampler->can_read_frames() >= Params::frame_size)
+      {
+        vector<float> r_samples = in_resampler->read_frames (Params::frame_size);
+
+        /* generate watermark at normalized sample rate */
+        vector<float> wm_samples = wm_gen.run (r_samples);
+
+        /* resample back to the original sample rate of the audio file */
+        out_resampler->write_frames (wm_samples);
+      }
+
+    size_t to_read = out_resampler->can_read_frames();
+    return out_resampler->read_frames (to_read);
+  }
+};
+
+int
+add_watermark (const string& infile, const string& outfile, const string& bits)
+{
+  auto bitvec = bit_str_to_vec (bits);
+  if (bitvec.empty())
+    {
+      fprintf (stderr, "audiowmark: cannot parse bits %s\n", bits.c_str());
+      return 1;
+    }
+  if (bitvec.size() > Params::payload_size)
+    {
+      fprintf (stderr, "audiowmark: number of bits in message '%s' larger than payload size\n", bits.c_str());
+      return 1;
+    }
+  if (bitvec.size() < Params::payload_size)
+    {
+      /* expand message automatically; good for testing, maybe not so good for the final product */
+      vector<int> expanded_bitvec;
+      for (size_t i = 0; i < Params::payload_size; i++)
+        expanded_bitvec.push_back (bitvec[i % bitvec.size()]);
+      bitvec = expanded_bitvec;
+    }
+  info ("Input:        %s\n", infile.c_str());
+  info ("Output:       %s\n", outfile.c_str());
+  info ("Message:      %s\n", bit_vec_to_str (bitvec).c_str());
+  info ("Strength:     %.6g\n\n", Params::water_delta * 1000);
+
+  /* add forward error correction, bitvec will now be a lot larger */
+  auto bitvec_a = randomize_bit_order (conv_encode (ConvBlockType::a, bitvec), /* encode */ true);
+  auto bitvec_b = randomize_bit_order (conv_encode (ConvBlockType::b, bitvec), /* encode */ true);
+
+  std::unique_ptr<AudioInputStream> in_stream; // FIXME: virtual constructor
+  if (Params::input_format == Format::AUTO)
+    {
+      SFInputStream *sistream = new SFInputStream();
+      in_stream.reset (sistream);
+      Error err = sistream->open (infile);
+      if (err && mp3_detect (infile))
+        {
+          MP3InputStream *mistream = new MP3InputStream();
+          in_stream.reset (mistream);
+
+          err = mistream->open (infile);
+          if (err)
+            {
+              fprintf (stderr, "audiowmark: error opening mp3 %s: %s\n", infile.c_str(), err.message());
+              return 1;
+            }
+        }
+      else if (err)
+        {
+          fprintf (stderr, "audiowmark: error opening %s: %s\n", infile.c_str(), err.message());
+          return 1;
+        }
+    }
+  else
+    {
+      RawInputStream *ristream = new RawInputStream();
+      in_stream.reset (ristream);
+      Error err = ristream->open (infile, Params::raw_input_format);
+      if (err)
+        {
+          fprintf (stderr, "audiowmark: error opening %s: %s\n", infile.c_str(), err.message());
+          return 1;
+        }
+    }
+  if (in_stream->n_frames() == AudioInputStream::N_FRAMES_UNKNOWN)
+    {
+      info ("Time:         unknown\n");
+    }
+  else
+    {
+      int orig_seconds = in_stream->n_frames() / in_stream->sample_rate();
+      info ("Time:         %d:%02d\n", orig_seconds / 60, orig_seconds % 60);
+    }
+  info ("Sample Rate:  %d\n", in_stream->sample_rate());
+  info ("Channels:     %d\n", in_stream->n_channels());
+
+  const int out_bit_depth = in_stream->bit_depth() > 16 ? 24 : 16;
+  std::unique_ptr<AudioOutputStream> out_stream;
+  if (Params::output_format == Format::RAW)
+    {
+      RawOutputStream *rostream = new RawOutputStream();
+      out_stream.reset (rostream);
+      Error err = rostream->open (outfile, Params::raw_output_format);
+      if (err)
+        {
+          fprintf (stderr, "audiowmark: error opening %s: %s\n", outfile.c_str(), err.message());
+          return 1;
+        }
+    }
+  else if (outfile == "-")
+    {
+      StdoutWavOutputStream *swstream = new StdoutWavOutputStream();
+      out_stream.reset (swstream);
+      Error err = swstream->open (in_stream->n_channels(), in_stream->sample_rate(), out_bit_depth, in_stream->n_frames());
+      if (err)
+        {
+          error ("audiowmark: error writing to -: %s\n", err.message());
+          return 1;
+        }
+    }
+  else
+    {
+      SFOutputStream *sfostream = new SFOutputStream();
+      out_stream.reset (sfostream);
+      Error err = sfostream->open (outfile, in_stream->n_channels(), in_stream->sample_rate(), out_bit_depth, in_stream->n_frames());
+      if (err)
+        {
+          error ("audiowmark: error writing to %s: %s\n", outfile.c_str(), err.message());
+          return 1;
+        }
+    }
+  vector<float> samples;
+
+  const int n_channels = in_stream->n_channels();
+  AudioBuffer audio_buffer (n_channels);
+  WatermarkResampler wm_resampler (n_channels, in_stream->sample_rate(), bitvec_a, bitvec_b);
+  if (!wm_resampler.init_ok())
+    return 1;
+
+  size_t total_input_frames = 0;
+  size_t total_output_frames = 0;
+  while (true)
+    {
+      Error err = Error::Code::NONE;
+
+      err = in_stream->read_frames (samples, Params::frame_size);
+      if (err)
+        {
+          error ("audiowmark: input stream read failed: %s\n", err.message());
+          return 1;
+        }
+      total_input_frames += samples.size() / n_channels;
+
+      if (samples.size() < Params::frame_size * n_channels)
+        {
+          if (total_input_frames == total_output_frames)
+            break;
+
+          /* zero sample padding after the actual input */
+          samples.resize (Params::frame_size * n_channels);
+        }
+      audio_buffer.write_frames (samples);
+      samples = wm_resampler.run (samples);
+      size_t to_read = samples.size() / n_channels;
+      vector<float> orig_samples  = audio_buffer.read_frames (to_read);
+      assert (samples.size() == orig_samples.size());
+
+      for (size_t i = 0; i < samples.size(); i++)
+        samples[i] += orig_samples[i];
+
+      size_t max_write_frames = total_input_frames - total_output_frames;
+      if (samples.size() > max_write_frames * n_channels)
+        samples.resize (max_write_frames * n_channels);
+
+      err = out_stream->write_frames (samples);
+      if (err)
+        {
+          error ("audiowmark output write failed: %s\n", err.message());
+          return 1;
+        }
+      total_output_frames += samples.size() / n_channels;
+    }
+  fprintf (stderr, "total output: %zd, expected: %zd\n", total_output_frames, in_stream->n_frames());
+#if 0
+  if (Params::snr)
+    {
+      /* compute/show signal to noise ratio */
+      double delta_power = 0;
+      double signal_power = 0;
+      for (size_t i = 0; i < samples.size(); i++)
+        {
+          const double orig_scaled = samples[i];      // original sample
+          const double delta       = out_signal[i];   // watermark
+
+          delta_power += delta * delta;
+          signal_power += orig_scaled * orig_scaled;
+        }
+      delta_power /= samples.size();
+      signal_power /= samples.size();
+
+      printf ("SNR:          %f dB\n", 10 * log10 (signal_power / delta_power));
+    }
+  float max_value = 1e-6;
+  for (size_t i = 0; i < samples.size(); i++)
+    {
+      /* Typically the original samples are already in range [-1;1]. However in
+       * some cases (mp3 loader), the samples are not fully normalized; in those
+       * cases, for volume normalization we treat them as-if they had been
+       * clipped already; final clipping will be done while saving.
+       */
+      const float x = bound<float> (-1, samples[i], 1);
+      const float value = fabsf (x + out_signal[i]);
+      if (value > max_value)
+        max_value = value;
+    }
+
+  // scale (samples + watermark) down if necessary to avoid clipping
+  const float scale = min (1.0 / max_value, 1.0);
+  for (size_t i = 0; i < samples.size(); i++)
+    samples[i] = (samples[i] + out_signal[i]) * scale;
+
+  printf ("Data Blocks:  %d\n", data_blocks);
+  printf ("Volume Norm:  %.3f (%.2f dB)\n", scale, db_from_factor (scale, -96));
+#endif
+  return 0;
+}
+
+
--- a/src/wmcommon.cc
+++ b/src/wmcommon.cc
+#include "wmcommon.hh"
+#include "fft.hh"
+#include "convcode.hh"
+
+int    Params::frames_per_bit  = 2;
+double Params::water_delta     = 0.01;
+bool   Params::mix             = true;
+bool   Params::hard            = false; // hard decode bits? (soft decoding is better)
+bool   Params::snr             = false; // compute/show snr while adding watermark
+int    Params::have_key        = 0;
+int    Params::test_cut        = 0; // for sync test
+bool   Params::test_no_sync    = false; // disable sync
+int    Params::test_truncate   = 0;
+
+Format Params::input_format     = Format::AUTO;
+Format Params::output_format    = Format::AUTO;
+
+RawFormat Params::raw_input_format;
+RawFormat Params::raw_output_format;
+
+using std::vector;
+using std::complex;
+
+inline double
+window_cos (double x) /* von Hann window */
+{
+  if (fabs (x) > 1)
+    return 0;
+  return 0.5 * cos (x * M_PI) + 0.5;
+}
+
+inline double
+window_hamming (double x) /* sharp (rectangle) cutoffs at boundaries */
+{
+  if (fabs (x) > 1)
+    return 0;
+
+  return 0.54 + 0.46 * cos (M_PI * x);
+}
+
+/*
+ * glibc log2f is a lot faster than glibc log10
+ */
+inline double
+fast_log10 (double l)
+{
+  constexpr double log2_log10_factor = 0.3010299956639811952; // 1 / log2 (10)
+
+  return log2f (l) * log2_log10_factor;
+}
+
+double
+db_from_factor (double factor, double min_dB)
+{
+  if (factor > 0)
+    {
+      double dB = fast_log10 (factor); /* Bell */
+      dB *= 20;
+      return dB;
+    }
+  else
+    return min_dB;
+}
+
+FFTAnalyzer::FFTAnalyzer (int n_channels) :
+  m_n_channels (n_channels)
+{
+  /* generate analysis window */
+  m_window.resize (Params::frame_size);
+
+  double window_weight = 0;
+  for (size_t i = 0; i < Params::frame_size; i++)
+    {
+      const double fsize_2 = Params::frame_size / 2.0;
+      // const double win =  window_cos ((i - fsize_2) / fsize_2);
+      const double win = window_hamming ((i - fsize_2) / fsize_2);
+      //const double win = 1;
+      m_window[i] = win;
+      window_weight += win;
+    }
+
+  /* normalize window using window weight */
+  for (size_t i = 0; i < Params::frame_size; i++)
+    {
+      m_window[i] *= 2.0 / window_weight;
+    }
+
+  /* allocate properly aligned buffers for SIMD */
+  m_frame  = new_array_float (Params::frame_size);
+  m_frame_fft = new_array_float (Params::frame_size);
+}
+
+FFTAnalyzer::~FFTAnalyzer()
+{
+  free_array_float (m_frame);
+  free_array_float (m_frame_fft);
+}
+
+vector<vector<complex<float>>>
+FFTAnalyzer::run_fft (const vector<float>& samples, size_t start_index)
+{
+  assert (samples.size() >= (Params::frame_size + start_index) * m_n_channels);
+
+  vector<vector<complex<float>>> fft_out;
+  for (int ch = 0; ch < m_n_channels; ch++)
+    {
+      size_t pos = start_index * m_n_channels + ch;
+      assert (pos + (Params::frame_size - 1) * m_n_channels < samples.size());
+
+      /* deinterleave frame data and apply window */
+      for (size_t x = 0; x < Params::frame_size; x++)
+        {
+          m_frame[x] = samples[pos] * m_window[x];
+          pos += m_n_channels;
+        }
+      /* FFT transform */
+      fftar_float (Params::frame_size, m_frame, m_frame_fft);
+
+      /* complex<float> and frame_fft have the same layout in memory */
+      const complex<float> *first = (complex<float> *) m_frame_fft;
+      const complex<float> *last  = first + Params::frame_size / 2 + 1;
+      fft_out.emplace_back (first, last);
+    }
+
+  return fft_out;
+}
+
+vector<vector<complex<float>>>
+FFTAnalyzer::fft_range (const vector<float>& samples, size_t start_index, size_t frame_count)
+{
+  vector<vector<complex<float>>> fft_out;
+
+  /* if there is not enough space for frame_count values, return an error (empty vector) */
+  if (samples.size() < (start_index + frame_count * Params::frame_size) * m_n_channels)
+    return fft_out;
+
+  for (size_t f = 0; f < frame_count; f++)
+    {
+      const size_t frame_start = (f * Params::frame_size) + start_index;
+
+      vector<vector<complex<float>>> frame_result = run_fft (samples, frame_start);
+      for (auto& fr : frame_result)
+        fft_out.emplace_back (std::move (fr));
+    }
+  return fft_out;
+}
+
+int
+frame_pos (int f, bool sync)
+{
+  static vector<int> pos_vec;
+
+  if (pos_vec.empty())
+    {
+      int frame_count = mark_data_frame_count() + mark_sync_frame_count();
+      for (int i = 0; i < frame_count; i++)
+        pos_vec.push_back (i);
+
+      Random random (0, Random::Stream::frame_position);
+      random.shuffle (pos_vec);
+    }
+  if (sync)
+    {
+      assert (f >= 0 && size_t (f) < mark_sync_frame_count());
+
+      return pos_vec[f];
+    }
+  else
+    {
+      assert (f >= 0 && size_t (f) < mark_data_frame_count());
+
+      return pos_vec[f + mark_sync_frame_count()];
+    }
+}
+
+int
+sync_frame_pos (int f)
+{
+  return frame_pos (f, true);
+}
+
+int
+data_frame_pos (int f)
+{
+  return frame_pos (f, false);
+}
+
+size_t
+mark_data_frame_count()
+{
+  return conv_code_size (ConvBlockType::a, Params::payload_size) * Params::frames_per_bit;
+}
+
+size_t
+mark_sync_frame_count()
+{
+  return Params::sync_bits * Params::sync_frames_per_bit;
+}
+
+vector<MixEntry>
+gen_mix_entries()
+{
+  const int frame_count = mark_data_frame_count();
+  vector<MixEntry> mix_entries (frame_count * Params::bands_per_frame);
+
+  UpDownGen up_down_gen (Random::Stream::data_up_down);
+  int entry = 0;
+  for (int f = 0; f < frame_count; f++)
+    {
+      const int index = data_frame_pos (f);
+      UpDownArray up, down;
+      up_down_gen.get (f, up, down);
+
+      assert (up.size() == down.size());
+      for (size_t i = 0; i < up.size(); i++)
+        mix_entries[entry++] = { index, up[i], down[i] };
+    }
+  Random random (/* seed */ 0, Random::Stream::mix);
+  random.shuffle (mix_entries);
+
+  return mix_entries;
+}
+
+
--- a/src/wmcommon.hh
+++ b/src/wmcommon.hh
+#ifndef AUDIOWMARK_WM_COMMON_HH
+#define AUDIOWMARK_WM_COMMON_HH
+
+#include <array>
+#include <complex>
+
+#include "random.hh"
+#include "rawinputstream.hh"
+
+#include <assert.h>
+
+enum class Format { AUTO = 1, RAW = 2 };
+
+class Params
+{
+public:
+  static constexpr size_t frame_size      = 1024;
+  static           int    frames_per_bit;
+  static constexpr size_t bands_per_frame = 30;
+  static constexpr int max_band          = 100;
+  static constexpr int min_band          = 20;
+
+  static           double water_delta;
+  static           bool mix;
+  static           bool hard;                      // hard decode bits? (soft decoding is better)
+  static           bool snr;                       // compute/show snr while adding watermark
+  static           int  have_key;
+
+  static constexpr size_t payload_size     = 128;  // number of payload bits for the watermark
+
+  static constexpr int sync_bits           = 6;
+  static constexpr int sync_frames_per_bit = 85;
+  static constexpr int sync_search_step    = 256;
+  static constexpr int sync_search_fine    = 8;
+  static constexpr double sync_threshold2  = 0.7; // minimum refined quality
+
+  static constexpr size_t frames_pad_start = 250; // padding at start, in case track starts with silence
+  static constexpr int mark_sample_rate = 44100; // watermark generation and detection sample rate
+
+  static           int test_cut; // for sync test
+  static           bool test_no_sync;
+  static           int test_truncate;
+
+  static           Format input_format;
+  static           Format output_format;
+
+  static           RawFormat raw_input_format;
+  static           RawFormat raw_output_format;
+};
+
+typedef std::array<int, 30> UpDownArray;
+class UpDownGen
+{
+  Random::Stream    random_stream;
+  Random            random;
+  std::vector<int>  bands_reorder;
+
+public:
+  UpDownGen (Random::Stream random_stream) :
+    random_stream (random_stream),
+    random (0, random_stream),
+    bands_reorder (Params::max_band - Params::min_band + 1)
+  {
+    UpDownArray x;
+    assert (x.size() == Params::bands_per_frame);
+  }
+  void
+  get (int f, UpDownArray& up, UpDownArray& down)
+  {
+    for (size_t i = 0; i < bands_reorder.size(); i++)
+      bands_reorder[i] = Params::min_band + i;
+
+    random.seed (f, random_stream); // use per frame random seed
+    random.shuffle (bands_reorder);
+
+    assert (2 * Params::bands_per_frame < bands_reorder.size());
+    for (size_t i = 0; i < Params::bands_per_frame; i++)
+      {
+        up[i]   = bands_reorder[i];
+        down[i] = bands_reorder[Params::bands_per_frame + i];
+      }
+  }
+};
+
+class FFTAnalyzer
+{
+  int           m_n_channels = 0;
+  std::vector<float> m_window;
+  float        *m_frame = nullptr;
+  float        *m_frame_fft = nullptr;
+public:
+  FFTAnalyzer (int n_channels);
+  ~FFTAnalyzer();
+
+  std::vector<std::vector<std::complex<float>>> run_fft (const std::vector<float>& samples, size_t start_index);
+  std::vector<std::vector<std::complex<float>>> fft_range (const std::vector<float>& samples, size_t start_index, size_t frame_count);
+};
+
+struct MixEntry
+{
+  int  frame;
+  int  up;
+  int  down;
+};
+
+std::vector<MixEntry> gen_mix_entries();
+
+double db_from_factor (double factor, double min_dB);
+
+size_t mark_data_frame_count();
+size_t mark_sync_frame_count();
+
+int sync_frame_pos (int f);
+int data_frame_pos (int f);
+
+template<class T> std::vector<T>
+randomize_bit_order (const std::vector<T>& bit_vec, bool encode)
+{
+  std::vector<unsigned int> order;
+
+  for (size_t i = 0; i < bit_vec.size(); i++)
+    order.push_back (i);
+
+  Random random (/* seed */ 0, Random::Stream::bit_order);
+  random.shuffle (order);
+
+  std::vector<T> out_bits (bit_vec.size());
+  for (size_t i = 0; i < bit_vec.size(); i++)
+    {
+      if (encode)
+        out_bits[i] = bit_vec[order[i]];
+      else
+        out_bits[order[i]] = bit_vec[i];
+    }
+  return out_bits;
+}
+
+int add_watermark (const std::string& infile, const std::string& outfile, const std::string& bits);
+int get_watermark (const std::string& infile, const std::string& orig_pattern);
+
+#endif /* AUDIOWMARK_WM_COMMON_HH */
--- a/src/wmget.cc
+++ b/src/wmget.cc
+#include <string>
+#include <algorithm>
+
+#include <zita-resampler/resampler.h>
+#include <zita-resampler/vresampler.h>
+
+#include "wavdata.hh"
+#include "wmcommon.hh"
+#include "convcode.hh"
+
+using std::string;
+using std::vector;
+using std::min;
+using std::max;
+using std::complex;
+
+template<class R>
+static void
+process_resampler (R& resampler, const vector<float>& in, vector<float>& out)
+{
+  resampler.out_count = out.size() / resampler.nchan();
+  resampler.out_data = &out[0];
+
+  /* avoid timeshift: zita needs k/2 - 1 samples before the actual input */
+  resampler.inp_count = resampler.inpsize () / 2 - 1;
+  resampler.inp_data  = nullptr;
+  resampler.process();
+
+  resampler.inp_count = in.size() / resampler.nchan();
+  resampler.inp_data = (float *) &in[0];
+  resampler.process();
+
+  /* zita needs k/2 samples after the actual input */
+  resampler.inp_count = resampler.inpsize() / 2;
+  resampler.inp_data  = nullptr;
+  resampler.process();
+}
+
+static WavData
+resample (const WavData& wav_data, int rate)
+{
+  /* in our application, resampling should only be called if it is necessary
+   * since using the resampler with input rate == output rate would be slow
+   */
+  assert (rate != wav_data.sample_rate());
+
+  const int hlen = 16;
+  const double ratio = double (rate) / wav_data.sample_rate();
+
+  const vector<float>& in = wav_data.samples();
+  vector<float> out (lrint (in.size() / wav_data.n_channels() * ratio) * wav_data.n_channels());
+
+  /* zita-resampler provides two resampling algorithms
+   *
+   * a fast optimized version: Resampler
+   *   this is an optimized version, which works for many common cases,
+   *   like resampling between 22050, 32000, 44100, 48000, 96000 Hz
+   *
+   * a slower version: VResampler
+   *   this works for arbitary rates (like 33333 -> 44100 resampling)
+   *
+   * so we try using Resampler, and if that fails fall back to VResampler
+   */
+  Resampler resampler;
+  if (resampler.setup (wav_data.sample_rate(), rate, wav_data.n_channels(), hlen) == 0)
+    {
+      process_resampler (resampler, in, out);
+      return WavData (out, wav_data.n_channels(), rate, wav_data.bit_depth());
+    }
+
+  VResampler vresampler;
+  if (vresampler.setup (ratio, wav_data.n_channels(), hlen) == 0)
+    {
+      process_resampler (vresampler, in, out);
+      return WavData (out, wav_data.n_channels(), rate, wav_data.bit_depth());
+    }
+  fprintf (stderr, "audiowmark: resampling from rate %d to rate %d not supported.\n", wav_data.sample_rate(), rate);
+  exit (1);
+}
+
+static int
+frame_count (const WavData& wav_data)
+{
+  return wav_data.n_values() / wav_data.n_channels() / Params::frame_size;
+}
+
+static vector<float>
+normalize_soft_bits (const vector<float>& soft_bits)
+{
+  vector<float> norm_soft_bits;
+
+  /* soft decoding produces better error correction than hard decoding */
+  if (Params::hard)
+    {
+      for (auto value : soft_bits)
+        norm_soft_bits.push_back (value > 0 ? 1.0 : 0.0);
+    }
+  else
+    {
+      /* figure out average level of each bit */
+      double mean = 0;
+      for (auto value : soft_bits)
+        mean += fabs (value);
+      mean /= soft_bits.size();
+
+      /* rescale from [-mean,+mean] to [0.0,1.0] */
+      for (auto value : soft_bits)
+        norm_soft_bits.push_back (0.5 * (value / mean + 1));
+    }
+
+  return norm_soft_bits;
+}
+
+static vector<float>
+mix_decode (vector<vector<complex<float>>>& fft_out, int n_channels)
+{
+  vector<float> raw_bit_vec;
+
+  const int frame_count = mark_data_frame_count();
+
+  vector<MixEntry> mix_entries = gen_mix_entries();
+
+  double umag = 0, dmag = 0;
+  for (int f = 0; f < frame_count; f++)
+    {
+      for (int ch = 0; ch < n_channels; ch++)
+        {
+          for (size_t frame_b = 0; frame_b < Params::bands_per_frame; frame_b++)
+            {
+              int b = f * Params::bands_per_frame + frame_b;
+              const double min_db = -96;
+
+              const size_t index = mix_entries[b].frame * n_channels + ch;
+              const int u = mix_entries[b].up;
+              const int d = mix_entries[b].down;
+
+              umag += db_from_factor (abs (fft_out[index][u]), min_db);
+              dmag += db_from_factor (abs (fft_out[index][d]), min_db);
+            }
+        }
+      if ((f % Params::frames_per_bit) == (Params::frames_per_bit - 1))
+        {
+          raw_bit_vec.push_back (umag - dmag);
+          umag = 0;
+          dmag = 0;
+        }
+    }
+  return raw_bit_vec;
+}
+
+static vector<float>
+linear_decode (vector<vector<complex<float>>>& fft_out, int n_channels)
+{
+  UpDownGen     up_down_gen (Random::Stream::data_up_down);
+  vector<float> raw_bit_vec;
+
+  const int frame_count = mark_data_frame_count();
+
+  double umag = 0, dmag = 0;
+  for (int f = 0; f < frame_count; f++)
+    {
+      for (int ch = 0; ch < n_channels; ch++)
+        {
+          const size_t index = data_frame_pos (f) * n_channels + ch;
+          UpDownArray up, down;
+          up_down_gen.get (f, up, down);
+
+          const double min_db = -96;
+          for (auto u : up)
+            umag += db_from_factor (abs (fft_out[index][u]), min_db);
+
+          for (auto d : down)
+            dmag += db_from_factor (abs (fft_out[index][d]), min_db);
+        }
+      if ((f % Params::frames_per_bit) == (Params::frames_per_bit - 1))
+        {
+          raw_bit_vec.push_back (umag - dmag);
+          umag = 0;
+          dmag = 0;
+        }
+    }
+  return raw_bit_vec;
+}
+
+static double
+normalize_sync_quality (double raw_quality)
+{
+  /* the quality for a good sync block depends on watermark strength
+   *
+   * this is just an approximation, but it should be good enough to be able to
+   * use one single threshold on the normalized value check if we have a sync
+   * block or not - typical output is 1.0 or more for sync blocks and close
+   * to 0.0 for non-sync blocks
+   */
+  return raw_quality / min (Params::water_delta, 0.080) / 2.9;
+}
+
+class SyncFinder
+{
+  vector<vector<int>> up;
+  vector<vector<int>> down;
+
+  void
+  init_up_down (const WavData& wav_data)
+  {
+    up.clear();
+    down.clear();
+
+    up.resize (Params::sync_bits);
+    down.resize (Params::sync_bits);
+
+    UpDownGen up_down_gen (Random::Stream::sync_up_down);
+    size_t n_bands = Params::max_band - Params::min_band + 1;
+    for (int bit = 0; bit < Params::sync_bits; bit++)
+      {
+        for (int f = 0; f < Params::sync_frames_per_bit; f++)
+          {
+            UpDownArray frame_up, frame_down;
+            up_down_gen.get (f + bit * Params::sync_frames_per_bit, frame_up, frame_down);
+
+            for (auto u : frame_up)
+              up[bit].push_back (u - Params::min_band + sync_frame_pos (f + bit * Params::sync_frames_per_bit) * n_bands * wav_data.n_channels());
+
+            for (auto d : frame_down)
+              down[bit].push_back (d - Params::min_band + sync_frame_pos (f + bit * Params::sync_frames_per_bit) * n_bands * wav_data.n_channels());
+          }
+        sort (up[bit].begin(), up[bit].end());
+        sort (down[bit].begin(), down[bit].end());
+      }
+  }
+  double
+  sync_decode (const WavData& wav_data, const size_t start_frame, const vector<float>& fft_out_db, ConvBlockType *block_type)
+  {
+    double sync_quality = 0;
+
+    size_t n_bands = Params::max_band - Params::min_band + 1;
+    for (int bit = 0; bit < Params::sync_bits; bit++)
+      {
+        float umag = 0, dmag = 0;
+
+        for (int ch = 0; ch < wav_data.n_channels(); ch++)
+          {
+            const int index = (start_frame * wav_data.n_channels() + ch) * n_bands;
+
+            for (size_t i = 0; i < up[bit].size(); i++)
+              {
+                umag += fft_out_db[index + up[bit][i]];
+                dmag += fft_out_db[index + down[bit][i]];
+              }
+          }
+        /* convert avoiding bias, raw_bit < 0 => 0 bit received; raw_bit > 0 => 1 bit received */
+        double raw_bit;
+        if (umag < dmag)
+          {
+            raw_bit = 1 - umag / dmag;
+          }
+        else
+          {
+            raw_bit = dmag / umag - 1;
+          }
+
+        const int expect_data_bit = bit & 1; /* expect 010101 */
+        const double q = expect_data_bit ? raw_bit : -raw_bit;
+        sync_quality += q;
+      }
+    sync_quality /= Params::sync_bits;
+    sync_quality = normalize_sync_quality (sync_quality);
+
+    if (sync_quality < 0)
+      {
+        *block_type = ConvBlockType::b;
+        return -sync_quality;
+      }
+    else
+      {
+        *block_type = ConvBlockType::a;
+        return sync_quality;
+      }
+  }
+public:
+  struct Score {
+    size_t        index;
+    double        quality;
+    ConvBlockType block_type;
+  };
+  vector<Score>
+  search (const WavData& wav_data)
+  {
+    vector<Score> result_scores;
+    vector<Score> sync_scores;
+
+    if (Params::test_no_sync)
+      {
+        const size_t expect0 = Params::frames_pad_start * Params::frame_size;
+        const size_t expect_step = (mark_sync_frame_count() + mark_data_frame_count()) * Params::frame_size;
+        const size_t expect_end = frame_count (wav_data) * Params::frame_size;
+
+        int ab = 0;
+        for (size_t expect_index = expect0; expect_index + expect_step < expect_end; expect_index += expect_step)
+          result_scores.push_back (Score { expect_index, 1.0, (ab++ & 1) ? ConvBlockType::b : ConvBlockType::a });
+
+        return result_scores;
+      }
+    init_up_down (wav_data);
+
+    vector<float> fft_db;
+
+    // compute multiple time-shifted fft vectors
+    size_t n_bands = Params::max_band - Params::min_band + 1;
+    for (size_t sync_shift = 0; sync_shift < Params::frame_size; sync_shift += Params::sync_search_step)
+      {
+        sync_fft (wav_data, sync_shift, frame_count (wav_data) - 1, fft_db, /* want all frames */ {});
+        for (int start_frame = 0; start_frame < frame_count (wav_data); start_frame++)
+          {
+            const size_t sync_index = start_frame * Params::frame_size + sync_shift;
+            if ((start_frame + mark_sync_frame_count() + mark_data_frame_count()) * wav_data.n_channels() * n_bands < fft_db.size())
+              {
+                ConvBlockType block_type;
+                double quality = sync_decode (wav_data, start_frame, fft_db, &block_type);
+                // printf ("%zd %f\n", sync_index, quality);
+                sync_scores.emplace_back (Score { sync_index, quality, block_type });
+              }
+          }
+      }
+    sort (sync_scores.begin(), sync_scores.end(), [] (const Score& a, const Score &b) { return a.index < b.index; });
+
+    vector<int> want_frames (mark_sync_frame_count() + mark_data_frame_count());
+    for (size_t f = 0; f < mark_sync_frame_count(); f++)
+      want_frames[sync_frame_pos (f)] = 1;
+
+    /* for strength 8 and above:
+     *   -> more false positive candidates are rejected, so we can use a lower threshold
+     *
+     * for strength 7 and below:
+     *   -> we need a higher threshold, because otherwise watermark detection takes too long
+     */
+    const double strength = Params::water_delta * 1000;
+    const double sync_threshold1 = strength > 7.5 ? 0.4 : 0.5;
+
+    for (size_t i = 0; i < sync_scores.size(); i++)
+      {
+        // printf ("%zd %f\n", sync_scores[i].index, sync_scores[i].quality);
+        if (sync_scores[i].quality > sync_threshold1)
+          {
+            double q_last = -1;
+            double q_next = -1;
+
+            if (i > 0)
+              q_last = sync_scores[i - 1].quality;
+
+            if (i + 1 < sync_scores.size())
+              q_next = sync_scores[i + 1].quality;
+
+            if (sync_scores[i].quality > q_last && sync_scores[i].quality > q_next)
+              {
+                //printf ("%zd %s %f", sync_scores[i].index, find_closest_sync (sync_scores[i].index), sync_scores[i].quality);
+
+                // refine match
+                double best_quality       = sync_scores[i].quality;
+                size_t best_index         = sync_scores[i].index;
+                ConvBlockType best_block_type = sync_scores[i].block_type; /* doesn't really change during refinement */
+
+                int start = std::max (int (sync_scores[i].index) - Params::sync_search_step, 0);
+                int end   = sync_scores[i].index + Params::sync_search_step;
+                for (int fine_index = start; fine_index <= end; fine_index += Params::sync_search_fine)
+                  {
+                    sync_fft (wav_data, fine_index, mark_sync_frame_count() + mark_data_frame_count(), fft_db, want_frames);
+                    if (fft_db.size())
+                      {
+                        ConvBlockType block_type;
+                        double        q = sync_decode (wav_data, 0, fft_db, &block_type);
+
+                        if (q > best_quality)
+                          {
+                            best_quality = q;
+                            best_index   = fine_index;
+                          }
+                      }
+                  }
+                //printf (" => refined: %zd %s %f\n", best_index, find_closest_sync (best_index), best_quality);
+                if (best_quality > Params::sync_threshold2)
+                  result_scores.push_back (Score { best_index, best_quality, best_block_type });
+              }
+          }
+      }
+    return result_scores;
+  }
+private:
+  void
+  sync_fft (const WavData& wav_data, size_t index, size_t frame_count, vector<float>& fft_out_db, const vector<int>& want_frames)
+  {
+    fft_out_db.clear();
+
+    /* read past end? -> fail */
+    if (wav_data.n_values() < (index + frame_count * Params::frame_size) * wav_data.n_channels())
+      return;
+
+    FFTAnalyzer fft_analyzer (wav_data.n_channels());
+    const vector<float>& samples = wav_data.samples();
+    const size_t n_bands = Params::max_band - Params::min_band + 1;
+    int out_pos = 0;
+
+    fft_out_db.resize (wav_data.n_channels() * n_bands * frame_count);
+
+    for (size_t f = 0; f < frame_count; f++)
+      {
+        const double min_db = -96;
+        if (want_frames.size() && !want_frames[f])
+          {
+            for (int ch = 0; ch < wav_data.n_channels(); ch++)
+              for (int i = Params::min_band; i <= Params::max_band; i++)
+                fft_out_db[out_pos++] = min_db;
+          }
+        else
+          {
+            vector<vector<complex<float>>> frame_result = fft_analyzer.run_fft (samples, index + f * Params::frame_size);
+
+            /* computing db-magnitude is expensive, so we better do it here */
+            for (int ch = 0; ch < wav_data.n_channels(); ch++)
+              for (int i = Params::min_band; i <= Params::max_band; i++)
+                fft_out_db[out_pos++] = db_from_factor (abs (frame_result[ch][i]), min_db);
+          }
+      }
+  }
+
+  const char*
+  find_closest_sync (size_t index)
+  {
+    int best_error = 0xffff;
+    int best = 0;
+
+    for (int i = 0; i < 100; i++)
+      {
+        int error = abs (int (index) - int (i * Params::sync_bits * Params::sync_frames_per_bit * Params::frame_size));
+        if (error < best_error)
+          {
+            best = i;
+            best_error = error;
+          }
+      }
+    static char buffer[1024]; // this code is for debugging only, so this should be ok
+    sprintf (buffer, "n:%d offset:%d", best, int (index) - int (best * Params::sync_bits * Params::sync_frames_per_bit * Params::frame_size));
+    return buffer;
+  }
+};
+
+static int
+decode_and_report (const WavData& wav_data, const string& orig_pattern)
+{
+  int match_count = 0, total_count = 0, sync_match = 0;
+
+  SyncFinder                sync_finder;
+  vector<SyncFinder::Score> sync_scores = sync_finder.search (wav_data);
+
+  auto report_pattern = [&] (SyncFinder::Score sync_score, const vector<int>& bit_vec, float decode_error)
+  {
+    if (sync_score.index)
+      {
+        const char *block_str = nullptr;
+
+        switch (sync_score.block_type)
+          {
+            case ConvBlockType::a:  block_str = "A";
+                                    break;
+            case ConvBlockType::b:  block_str = "B";
+                                    break;
+            case ConvBlockType::ab: block_str = "AB";
+                                    break;
+          }
+        const int seconds = sync_score.index / wav_data.sample_rate();
+        printf ("pattern %2d:%02d %s %.3f %.3f %s\n", seconds / 60, seconds % 60, bit_vec_to_str (bit_vec).c_str(),
+                sync_score.quality, decode_error, block_str);
+      }
+    else /* this is the combined pattern "all" */
+      {
+        printf ("pattern   all %s %.3f %.3f\n", bit_vec_to_str (bit_vec).c_str(), sync_score.quality, decode_error);
+      }
+    if (!orig_pattern.empty())
+      {
+        bool        match = true;
+        vector<int> orig_vec = bit_str_to_vec (orig_pattern);
+
+        for (size_t i = 0; i < bit_vec.size(); i++)
+          match = match && (bit_vec[i] == orig_vec[i % orig_vec.size()]);
+
+        if (match)
+          match_count++;
+
+      }
+    total_count++;
+  };
+
+  vector<float> raw_bit_vec_all (conv_code_size (ConvBlockType::ab, Params::payload_size));
+  vector<int>   raw_bit_vec_norm (2);
+
+  SyncFinder::Score score_all { 0, 0 };
+  SyncFinder::Score score_ab  { 0, 0, ConvBlockType::ab };
+
+  ConvBlockType last_block_type = ConvBlockType::b;
+  vector<vector<float>> ab_raw_bit_vec (2);
+  vector<float>         ab_quality (2);
+  FFTAnalyzer           fft_analyzer (wav_data.n_channels());
+  for (auto sync_score : sync_scores)
+    {
+      const size_t count = mark_sync_frame_count() + mark_data_frame_count();
+      const size_t index = sync_score.index;
+      const int    ab = (sync_score.block_type == ConvBlockType::b); /* A -> 0, B -> 1 */
+
+      auto fft_range_out = fft_analyzer.fft_range (wav_data.samples(), index, count);
+      if (fft_range_out.size())
+        {
+          /* ---- retrieve bits from watermark ---- */
+          vector<float> raw_bit_vec;
+          if (Params::mix)
+            {
+              raw_bit_vec = mix_decode (fft_range_out, wav_data.n_channels());
+            }
+          else
+            {
+              raw_bit_vec = linear_decode (fft_range_out, wav_data.n_channels());
+            }
+          assert (raw_bit_vec.size() == conv_code_size (ConvBlockType::a, Params::payload_size));
+
+          raw_bit_vec = randomize_bit_order (raw_bit_vec, /* encode */ false);
+
+          /* ---- deal with this pattern ---- */
+          float decode_error = 0;
+          vector<int> bit_vec = conv_decode_soft (sync_score.block_type, normalize_soft_bits (raw_bit_vec), &decode_error);
+
+          report_pattern (sync_score, bit_vec, decode_error);
+
+          /* ---- update "all" pattern ---- */
+          score_all.quality += sync_score.quality;
+
+          for (size_t i = 0; i < raw_bit_vec.size(); i++)
+            {
+              raw_bit_vec_all[i * 2 + ab] += raw_bit_vec[i];
+            }
+          raw_bit_vec_norm[ab]++;
+
+          /* ---- if last block was A & this block is B => deal with combined AB block */
+          ab_raw_bit_vec[ab] = raw_bit_vec;
+          ab_quality[ab]     = sync_score.quality;
+          if (last_block_type == ConvBlockType::a && sync_score.block_type == ConvBlockType::b)
+            {
+              /* join A and B block -> AB block */
+              vector<float> ab_bits (raw_bit_vec.size() * 2);
+              for (size_t i = 0; i <  raw_bit_vec.size(); i++)
+                {
+                  ab_bits[i * 2] = ab_raw_bit_vec[0][i];
+                  ab_bits[i * 2 + 1] = ab_raw_bit_vec[1][i];
+                }
+              vector<int> bit_vec = conv_decode_soft (ConvBlockType::ab, normalize_soft_bits (ab_bits), &decode_error);
+              score_ab.index = sync_score.index;
+              score_ab.quality = (ab_quality[0] + ab_quality[1]) / 2;
+              report_pattern (score_ab, bit_vec, decode_error);
+            }
+          last_block_type = sync_score.block_type;
+        }
+    }
+  if (total_count > 1) /* all pattern: average soft bits of all watermarks and decode */
+    {
+      for (size_t i = 0; i < raw_bit_vec_all.size(); i += 2)
+        {
+          raw_bit_vec_all[i]     /= max (raw_bit_vec_norm[0], 1); /* normalize A soft bits with number of A blocks */
+          raw_bit_vec_all[i + 1] /= max (raw_bit_vec_norm[1], 1); /* normalize B soft bits with number of B blocks */
+        }
+      score_all.quality /= raw_bit_vec_norm[0] + raw_bit_vec_norm[1];
+
+      vector<float> soft_bit_vec = normalize_soft_bits (raw_bit_vec_all);
+
+      float decode_error = 0;
+      vector<int> bit_vec = conv_decode_soft (ConvBlockType::ab, soft_bit_vec, &decode_error);
+
+      report_pattern (score_all, bit_vec, decode_error);
+    }
+
+  if (!orig_pattern.empty())
+    {
+      printf ("match_count %d %d\n", match_count, total_count);
+
+      /* search sync markers at typical positions */
+      const int expect0 = Params::frames_pad_start * Params::frame_size;
+      const int expect_step = (mark_sync_frame_count() + mark_data_frame_count()) * Params::frame_size;
+      const int expect_end = frame_count (wav_data) * Params::frame_size;
+
+      for (int expect_index = expect0; expect_index + expect_step < expect_end; expect_index += expect_step)
+        {
+          for (auto sync_score : sync_scores)
+            {
+              if (abs (int (sync_score.index + Params::test_cut) - expect_index) < Params::frame_size / 2)
+                {
+                  sync_match++;
+                  break;
+                }
+            }
+        }
+      printf ("sync_match %d %zd\n", sync_match, sync_scores.size());
+    }
+  return 0;
+}
+
+int
+get_watermark (const string& infile, const string& orig_pattern)
+{
+  WavData wav_data;
+  Error err = wav_data.load (infile);
+  if (err)
+    {
+      fprintf (stderr, "audiowmark: error loading %s: %s\n", infile.c_str(), err.message());
+      return 1;
+    }
+
+  if (Params::test_truncate)
+    {
+      const size_t  want_n_samples = wav_data.sample_rate() * wav_data.n_channels() * Params::test_truncate;
+      vector<float> short_samples  = wav_data.samples();
+
+      if (want_n_samples < short_samples.size())
+        {
+          short_samples.resize (want_n_samples);
+          wav_data.set_samples (short_samples);
+        }
+    }
+  if (wav_data.sample_rate() == Params::mark_sample_rate)
+    {
+      return decode_and_report (wav_data, orig_pattern);
+    }
+  else
+    {
+      return decode_and_report (resample (wav_data, Params::mark_sample_rate), orig_pattern);
+    }
+}