summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Matthias Vogelgesang <matthias.vogelgesang@kit.edu> 2015-06-23 09:21:18 +0200
committer: Matthias Vogelgesang <matthias.vogelgesang@kit.edu> 2015-06-23 09:21:18 +0200
commit: 3c671c08a7b8e01b030523b568998035138ddda4 (patch)
tree: 5ccd03d13966c44d3cb51812964fec3a3ce8cf37
parent: 420130400ccd0a3ca0a614e968fe90a004cb86e3 (diff)
download: libufodecode-3c671c08a7b8e01b030523b568998035138ddda4.tar.gz
libufodecode-3c671c08a7b8e01b030523b568998035138ddda4.tar.bz2
libufodecode-3c671c08a7b8e01b030523b568998035138ddda4.tar.xz
libufodecode-3c671c08a7b8e01b030523b568998035138ddda4.zip
Add SSE-based decoding of v6 format
Shaves off about 30% of the decoding time.
-rw-r--r-- src/ufodecode.c 48
1 file changed, 46 insertions, 2 deletions
diff --git a/src/ufodecode.c b/src/ufodecode.c
index f86abfc..e00ff38 100644
--- a/src/ufodecode.c
+++ b/src/ufodecode.c
@@ -235,18 +235,61 @@ ufo_decode_frame_channels_v6 (UfoDecoder *decoder, uint16_t *pixel_buffer, uint3
size_t base = 0;
size_t index = 0;
const size_t space = 640;
+#ifdef HAVE_SSE
+ const __m64 mask_fff = _mm_set_pi32 (0xfff, 0xfff);
+ __m64 mm_r, src1, src2, src3;
+ uint32_t *result;
+
+ result = (uint32_t *) &mm_r;
+#endif
while (raw[base] != 0xAAAAAAA) {
const size_t row_number = raw[base] & 0xfff;
const size_t pixel_number = (raw[base + 1] >> 16) & 0xfff;
base += 2;
-
index = row_number * IPECAMERA_WIDTH_20MP + pixel_number;
+#ifdef HAVE_SSE
+ src1 = _mm_set_pi32 (raw[base], raw[base + 3]);
+ src2 = _mm_set_pi32 (raw[base + 1], raw[base + 4]);
+ src3 = _mm_set_pi32 (raw[base + 2], raw[base + 5]);
+
+ mm_r = _mm_srli_pi32 (src1, 20);
+ pixel_buffer[index + 0 * space] = result[0];
+ pixel_buffer[index + IPECAMERA_WIDTH_20MP + 0 * space] = result[1];
+
+ mm_r = _mm_and_si64 (_mm_srli_pi32 (src1, 8), mask_fff);
+ pixel_buffer[index + 1 * space] = result[0];
+ pixel_buffer[index + IPECAMERA_WIDTH_20MP + 1 * space] = result[1];
+
+ mm_r = _mm_or_si64 (_mm_and_si64 (_mm_slli_pi32 (src1, 4), mask_fff), _mm_srli_pi32 (src2, 28));
+ pixel_buffer[index + 2 * space] = result[0];
+ pixel_buffer[index + IPECAMERA_WIDTH_20MP + 2 * space] = result[1];
+
+ mm_r = _mm_and_si64 (_mm_srli_pi32 (src2, 16), mask_fff);
+ pixel_buffer[index + 3 * space] = result[0];
+ pixel_buffer[index + IPECAMERA_WIDTH_20MP + 3 * space] = result[1];
+
+ mm_r = _mm_and_si64 (_mm_srli_pi32 (src2, 4), mask_fff);
+ pixel_buffer[index + 4 * space] = result[0];
+ pixel_buffer[index + IPECAMERA_WIDTH_20MP + 4 * space] = result[1];
+
+ mm_r = _mm_or_si64 (_mm_and_si64 (_mm_slli_pi32 (src2, 8), mask_fff), _mm_srli_pi32 (src3, 24));
+ pixel_buffer[index + 5 * space] = result[0];
+ pixel_buffer[index + IPECAMERA_WIDTH_20MP + 5 * space] = result[1];
+
+ mm_r = _mm_and_si64 (_mm_srli_pi32 (src3, 12), mask_fff);
+ pixel_buffer[index + 6 * space] = result[0];
+ pixel_buffer[index + IPECAMERA_WIDTH_20MP + 6 * space] = result[1];
+
+ mm_r = _mm_and_si64 (src3, mask_fff);
+ pixel_buffer[index + 7 * space] = result[0];
+ pixel_buffer[index + IPECAMERA_WIDTH_20MP + 7 * space] = result[1];
+#else
pixel_buffer[index + 0 * space] = (raw[base] >> 20);
pixel_buffer[index + 1 * space] = (raw[base] >> 8) & 0xfff;
- pixel_buffer[index + 2 * space] = ((raw[base] << 4) & 0xff) | ((raw[base + 1] >> 28) & 0xff);
+ pixel_buffer[index + 2 * space] = ((raw[base] << 4) & 0xfff) | (raw[base + 1] >> 28);
pixel_buffer[index + 3 * space] = (raw[base + 1] >> 16) & 0xfff;
pixel_buffer[index + 4 * space] = (raw[base + 1] >> 4) & 0xfff;
pixel_buffer[index + 5 * space] = ((raw[base + 1] << 8) & 0xfff) | (raw[base + 2] >> 24);
@@ -262,6 +305,7 @@ ufo_decode_frame_channels_v6 (UfoDecoder *decoder, uint16_t *pixel_buffer, uint3
pixel_buffer[index + 5 * space] = ((raw[base + 4] << 8) & 0xfff) | (raw[base + 5] >> 24);
pixel_buffer[index + 6 * space] = (raw[base + 5] >> 12) & 0xfff;
pixel_buffer[index + 7 * space] = (raw[base + 5] & 0xfff);
+#endif
base += 6;
}