Commit dc1cee7d authored by Stefan Westerfeld's avatar Stefan Westerfeld

Merge branch 'speed-detection-opt'

* speed-detection-opt:
  Update speed detection comments.
  Check for positive index without fractional shift in speed detection.
  Avoid floating point math for speed detection offset computation.
  Determine speed search compare range [begin,end] before loop (performance).
  Improve performance for speed detection bits compare.
  Reorder memory access in speed detection (performance).
Signed-off-by: Stefan Westerfeld's avatarStefan Westerfeld <stefan@space.twc.de>
parents 267755d2 1e3dc560
...@@ -87,7 +87,7 @@ public: ...@@ -87,7 +87,7 @@ public:
Mags& Mags&
operator() (int row, int col) operator() (int row, int col)
{ {
return m_data[row * m_cols + col]; return m_data[col * m_rows + row];
} }
void void
resize (int rows, int cols) resize (int rows, int cols)
...@@ -129,14 +129,21 @@ public: ...@@ -129,14 +129,21 @@ public:
float dmag = 0; float dmag = 0;
int count = 0; int count = 0;
}; };
struct CmpState
{
int offset = 0;
BitValue bit_values[Params::sync_bits];
};
private: private:
static constexpr int OFFSET_SHIFT = 16;
vector<SyncBit> sync_bits; vector<SyncBit> sync_bits;
MagMatrix sync_matrix; MagMatrix sync_matrix;
void prepare_mags (const SpeedScanParams& scan_params); void prepare_mags (const SpeedScanParams& scan_params);
void compare (double relative_speed); void compare (double relative_speed);
template<bool B2> template<int BLOCK>
void compare_bits (BitValue *bit_values, double relative_speed, int *start_mi, int offset); void compare_bits (vector<CmpState>& cmp_states, double relative_speed);
std::mutex mutex; std::mutex mutex;
vector<Score> result_scores; vector<Score> result_scores;
...@@ -267,45 +274,62 @@ SpeedSync::prepare_mags (const SpeedScanParams& scan_params) ...@@ -267,45 +274,62 @@ SpeedSync::prepare_mags (const SpeedScanParams& scan_params)
assert (row == n_sync_rows); assert (row == n_sync_rows);
} }
template<bool B2> void template<int BLOCK> void
SpeedSync::compare_bits (BitValue *bit_values, double relative_speed, int *start_mi, int offset) SpeedSync::compare_bits (vector<CmpState>& cmp_states, double relative_speed)
{ {
const int steps_per_frame = Params::frame_size / Params::sync_search_step; const int steps_per_frame = Params::frame_size / Params::sync_search_step;
const double relative_speed_inv = 1 / relative_speed; const double relative_speed_inv = 1 / relative_speed;
/* search start */ auto begin = cmp_states.end();
int mi = *start_mi; auto end = cmp_states.end();
while (mi > 0) for (size_t mi = 0; mi < sync_bits.size(); mi++)
{
int index = offset + sync_bits[mi - 1].frame * steps_per_frame;
if (index < 0)
break;
mi--;
}
*start_mi = mi;
for (auto si = sync_bits.begin() + mi; si != sync_bits.end(); si++)
{ {
int index = (offset + si->frame * steps_per_frame) * relative_speed_inv + 0.5; const int frame_offset = ((BLOCK * frames_per_block + sync_bits[mi].frame) * steps_per_frame * relative_speed_inv + 0.5) * (1 << OFFSET_SHIFT);
if (index >= sync_matrix.rows())
return;
auto& bv = bit_values[si->bit]; while (begin > cmp_states.begin())
auto mags = sync_matrix (index, mi);
if (B2)
{ {
bv.umag += mags.dmag; auto prev = begin - 1;
bv.dmag += mags.umag;
/*
* don't use OFFSET_SHIFT here; just ensure that index is positive
* to ensure that shifted value will properly round to nearest frame
* later on
*/
int index = prev->offset + frame_offset;
if (index < 0)
break;
begin = prev;
} }
else while (end > cmp_states.begin())
{ {
bv.umag += mags.umag; auto prev = end - 1;
bv.dmag += mags.dmag;
int index = (prev->offset + frame_offset) >> OFFSET_SHIFT;
if (index < sync_matrix.rows())
break;
end = prev;
} }
bv.count++;
mi++; for (auto it = begin; it != end; it++)
{
int index = (it->offset + frame_offset) >> OFFSET_SHIFT;
auto& bv = it->bit_values[sync_bits[mi].bit];
auto mags = sync_matrix (index, mi);
if (BLOCK & 1)
{
bv.umag += mags.dmag;
bv.dmag += mags.umag;
}
else
{
bv.umag += mags.umag;
bv.dmag += mags.dmag;
}
bv.count++;
}
} }
} }
...@@ -314,33 +338,36 @@ SpeedSync::compare (double relative_speed) ...@@ -314,33 +338,36 @@ SpeedSync::compare (double relative_speed)
{ {
const int steps_per_frame = Params::frame_size / Params::sync_search_step; const int steps_per_frame = Params::frame_size / Params::sync_search_step;
const int pad_start = frames_per_block * steps_per_frame + /* add a bit of overlap to handle boundaries */ steps_per_frame; const int pad_start = frames_per_block * steps_per_frame + /* add a bit of overlap to handle boundaries */ steps_per_frame;
Score best_score;
assert (steps_per_frame * Params::sync_search_step == Params::frame_size); assert (steps_per_frame * Params::sync_search_step == Params::frame_size);
int start_mi1 = sync_bits.size(); vector<CmpState> cmp_states;
int start_mi2 = sync_bits.size(); for (int offset = -pad_start; offset < 0; offset++)
int start_mi3 = sync_bits.size();
for (int offset = -pad_start; offset < 0; offset++)
{ {
BitValue bit_values[Params::sync_bits]; CmpState cs;
cs.offset = offset * ((1 << OFFSET_SHIFT) / relative_speed);
cmp_states.push_back (cs);
}
/* /*
* we need to compare 3 blocks here: * we need to compare 3 blocks here:
* - one block is necessary because we need to test all offsets (-pad_start..0) * - one block is necessary because we need to test all offsets (-pad_start..0)
* - two more blocks are necessary since speed detection ScanParams uses 50 seconds at most, * - two more blocks are necessary since speed detection ScanParams uses 50 seconds at most,
* and short payload (12 bits) has a block length of slightly over 30 seconds * and short payload (12 bits) has a block length of slightly over 30 seconds
*/ */
compare_bits<false> (bit_values, relative_speed, &start_mi1, offset); compare_bits<0> (cmp_states, relative_speed);
compare_bits<true> (bit_values, relative_speed, &start_mi2, offset + frames_per_block * steps_per_frame); compare_bits<1> (cmp_states, relative_speed);
compare_bits<false> (bit_values, relative_speed, &start_mi3, offset + frames_per_block * steps_per_frame * 2); compare_bits<2> (cmp_states, relative_speed);
Score best_score;
for (const auto& cs : cmp_states)
{
double sync_quality = 0; double sync_quality = 0;
int bit_count = 0; int bit_count = 0;
for (size_t bit = 0; bit < Params::sync_bits; bit++) for (size_t bit = 0; bit < Params::sync_bits; bit++)
{ {
const auto& bv = bit_values[bit]; const auto& bv = cs.bit_values[bit];
sync_quality += SyncFinder::bit_quality (bv.umag, bv.dmag, bit) * bv.count; sync_quality += SyncFinder::bit_quality (bv.umag, bv.dmag, bit) * bv.count;
bit_count += bv.count; bit_count += bv.count;
...@@ -357,7 +384,6 @@ SpeedSync::compare (double relative_speed) ...@@ -357,7 +384,6 @@ SpeedSync::compare (double relative_speed)
} }
} }
} }
//printf ("%f %f\n", best_score.speed, best_score.quality);
std::lock_guard<std::mutex> lg (mutex); std::lock_guard<std::mutex> lg (mutex);
result_scores.push_back (best_score); result_scores.push_back (best_score);
} }
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment