commit cec3e89: [Feature] Speed up is_ascii function

Vsevolod Stakhov vsevolod at highsecure.ru
Mon Jan 27 18:28:06 UTC 2020


Author: Vsevolod Stakhov
Date: 2020-01-27 18:05:15 +0000
URL: https://github.com/rspamd/rspamd/commit/cec3e89b046705b1ce488626ad059ce4b4ea9b6b

[Feature] Speed up is_ascii function

---
 src/libutil/str_util.c | 73 +++++++++++++++++++++++++++++++++++++++++++++++++-
 src/libutil/str_util.h | 34 +----------------------
 2 files changed, 73 insertions(+), 34 deletions(-)

diff --git a/src/libutil/str_util.c b/src/libutil/str_util.c
index 8fcaca484..5cee63baf 100644
--- a/src/libutil/str_util.c
+++ b/src/libutil/str_util.c
@@ -3324,4 +3324,75 @@ rspamd_string_len_split (const gchar *in, gsize len, const gchar *spill,
 	}
 
 	return res;
-}
\ No newline at end of file
+}
+
+#if defined(__x86_64__)
+#include <x86intrin.h>
+#endif
+
+static inline gboolean
+rspamd_str_has_8bit_u64 (const guchar *beg, gsize len)
+{
+	guint8 orb = 0;
+	/* Scalar check: result is TRUE iff some of the len bytes at beg is >= 0x80 */
+	if (len >= 16) {
+		const guchar *nextd = beg+8;
+		guint64 n1 = 0, n2 = 0;
+		/* NOTE(review): guint64 loads via a cast byte pointer, alignment unchecked here */
+		do {
+			n1 |= *(const guint64 *)beg; /* first 8 bytes of this 16-byte step */
+			n2 |= *(const guint64 *)nextd; /* second 8 bytes (nextd == beg + 8) */
+			beg += 16;
+			nextd += 16;
+			len -= 16;
+		} while (len >= 16);
+
+		/*
+		 * Idea from Benny Halevy <bhalevy at scylladb.com>
+		 * - some byte has bit 7 set ==> orb = !(non-zero) - 1 = 0 - 1 = 0xFF
+		 * - no byte has bit 7 set   ==> orb = !0 - 1          = 1 - 1 = 0x00
+		 */
+		orb = !((n1 | n2) & 0x8080808080808080ULL) - 1;
+	}
+	/* Fewer than 16 bytes remain here: fold them into orb one at a time */
+	while (len--) {
+		orb |= *beg++;
+	}
+
+	return orb >= 0x80;
+}
+
+/**
+ * Returns TRUE if any of the len bytes at beg is >= 0x80 (not 7-bit ASCII).
+ * x86_64 builds OR 32 bytes per step with SSE2; the rest goes to the u64 helper.
+ */
+gboolean
+rspamd_str_has_8bit (const guchar *beg, gsize len)
+{
+#if defined(__x86_64__)
+	if (len >= 32) {
+		const uint8_t *nextd = beg + 16;
+		__m128i n1 = _mm_set1_epi8 (0), n2;
+
+		n2 = n1;
+		while (len >= 32) {
+			/* SSE2 unaligned loads: lddqu is SSE3, not guaranteed on x86_64 */
+			__m128i xmm1 = _mm_loadu_si128 ((const __m128i *)beg);
+			__m128i xmm2 = _mm_loadu_si128 ((const __m128i *)nextd);
+
+			n1 = _mm_or_si128 (n1, xmm1);
+			n2 = _mm_or_si128 (n2, xmm2);
+			beg += 32;
+			nextd += 32;
+			len -= 32;
+		}
+		n1 = _mm_or_si128 (n1, n2);
+		/* movemask packs the top bit of each byte: non-zero means 8-bit data */
+		if (_mm_movemask_epi8 (n1)) {
+			return TRUE;
+		}
+	}
+#endif
+	/* Scalar check of whatever the SIMD loop did not consume */
+	return rspamd_str_has_8bit_u64 (beg, len);
+}
diff --git a/src/libutil/str_util.h b/src/libutil/str_util.h
index c08dd55bb..22643176b 100644
--- a/src/libutil/str_util.h
+++ b/src/libutil/str_util.h
@@ -440,39 +440,7 @@ gsize rspamd_memspn (const gchar *s, const gchar *e, gsize len);
  */
 #define rspamd_is_aligned(p, n) (((uintptr_t)(p) & ((uintptr_t)(n) - 1)) == 0)
 #define rspamd_is_aligned_as(p, v) rspamd_is_aligned(p, _Alignof(__typeof((v))))
-
-static inline gboolean
-rspamd_str_has_8bit (const guchar *beg, gsize len)
-{
-	unsigned long *w;
-	gsize i, leftover;
-
-	if (rspamd_is_aligned_as (beg, *w)) {
-		leftover = len % sizeof (*w);
-		w = (unsigned long *) beg;
-
-		for (i = 0; i < len / sizeof (*w); i++) {
-			if (rspamd_str_hasmore (*w, 127)) {
-				return TRUE;
-			}
-
-			w++;
-		}
-
-		beg = (const guchar *) w;
-	}
-	else {
-		leftover = len;
-	}
-
-	for (i = 0; i < leftover; i++) {
-		if (beg[i] > 127) {
-			return TRUE;
-		}
-	}
-
-	return FALSE;
-}
+gboolean rspamd_str_has_8bit (const guchar *beg, gsize len);
 
 struct UConverter;
 


More information about the Commits mailing list