commit 963a8e9: [Fix] Various fixes to QP encoding algorithm

Mon Jul 22 14:49:04 UTC 2019

Author: Vsevolod Stakhov
Date: 2019-07-22 15:27:43 +0100
URL: https://github.com/rspamd/rspamd/commit/963a8e9f83693a153f2133e6b946493601c43d21

[Fix] Various fixes to QP encoding algorithm
Issue: #2957

---
 src/libutil/str_util.c | 180 +++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 144 insertions(+), 36 deletions(-)

diff --git a/src/libutil/str_util.c b/src/libutil/str_util.c
index 290110b2b..962ed1871 100644
--- a/src/libutil/str_util.c
+++ b/src/libutil/str_util.c
@@ -917,25 +917,61 @@ rspamd_encode_base64_fold (const guchar *in, gsize inlen, gint str_len,
 	return rspamd_encode_base64_common (in, inlen, str_len, outlen, TRUE, how);
 }
 
+#define QP_RANGE(x) (((x) >= 33 && (x) <= 60) || ((x) >= 62 && (x) <= 126) \
+		|| (x) == '\r' || (x) == '\n' || (x) == ' ' || (x) == '\t')
+#define QP_SPAN_NORMAL(span, str_len) ((str_len) > 0 && \
+		((span) + 1) >= (str_len))
+#define QP_SPAN_SPECIAL(span, str_len) ((str_len) > 0 && \
+		((span) + 4) >= (str_len))
+
 gchar *
 rspamd_encode_qp_fold (const guchar *in, gsize inlen, gint str_len,
 						   gsize *outlen, enum rspamd_newlines_type how)
 {
-	gsize olen = 0, span = 0, i = 0;
+	gsize olen = 0, span = 0, i = 0, seen_spaces = 0;
 	gchar *out;
-	gint ch;
+	gint ch, last_sp;
 	const guchar *end = in + inlen, *p = in;
 	static const gchar hexdigests[16] = "0123456789ABCDEF";
 
 	while (p < end) {
 		ch = *p;
 
-		if (ch < 128 && ch != '\r' && ch != '\n') {
+		if (QP_RANGE(ch)) {
 			olen ++;
 			span ++;
+
+			if (ch == '\r' || ch == '\n') {
+				if (seen_spaces > 0) {
+					/* We must encode spaces at the end of line */
+					olen += 3;
+					seen_spaces = 0;
+					/* Reserve room for a soft line break if the encoded space would not fit */
+					if (QP_SPAN_SPECIAL(span, str_len)) {
+						if (how == RSPAMD_TASK_NEWLINES_CRLF) {
+							/* =\r\n */
+							olen += 3;
+						}
+						else {
+							olen += 2;
+						}
+					}
+					/* Continue with the same `ch` but without spaces logic */
+					continue;
+				}
+
+				span = 0;
+			}
+			else if (ch == ' ' || ch == '\t') {
+				seen_spaces ++;
+				last_sp = ch;
+			}
+			else {
+				seen_spaces = 0;
+			}
 		}
 		else {
-			if (str_len > 0 && span + 5 >= str_len) {
+			if (QP_SPAN_SPECIAL(span, str_len)) {
 				if (how == RSPAMD_TASK_NEWLINES_CRLF) {
 					/* =\r\n */
 					olen += 3;
@@ -950,7 +986,7 @@ rspamd_encode_qp_fold (const guchar *in, gsize inlen, gint str_len,
 			span += 3;
 		}
 
-		if (str_len > 0 && span + 3 >= str_len) {
+		if (QP_SPAN_NORMAL(span, str_len)) {
 			if (how == RSPAMD_TASK_NEWLINES_CRLF) {
 				/* =\r\n */
 				olen += 3;
@@ -964,21 +1000,112 @@ rspamd_encode_qp_fold (const guchar *in, gsize inlen, gint str_len,
 		p ++;
 	}
 
+	if (seen_spaces > 0) {
+		/* Reserve length for the last space encoded */
+		olen += 3;
+	}
+
 	out = g_malloc (olen + 1);
 	p = in;
 	i = 0;
 	span = 0;
+	seen_spaces = 0;
 
 	while (p < end) {
 		ch = *p;
 
-		if (ch < 128 && ch != '\r' && ch != '\n') {
+		if (QP_RANGE (ch)) {
+			if (ch == '\r' || ch == '\n') {
+				if (seen_spaces > 0) {
+					if (QP_SPAN_SPECIAL(span, str_len)) {
+						/* Add soft newline */
+						i --;
+
+						if (p + 1 < end || span + 3 >= str_len) {
+							switch (how) {
+							default:
+							case RSPAMD_TASK_NEWLINES_CRLF:
+								out[i++] = '=';
+								out[i++] = '\r';
+								out[i++] = '\n';
+								break;
+							case RSPAMD_TASK_NEWLINES_LF:
+								out[i++] = '=';
+								out[i++] = '\n';
+								break;
+							case RSPAMD_TASK_NEWLINES_CR:
+								out[i++] = '=';
+								out[i++] = '\r';
+								break;
+							}
+						}
+
+						/* Now write the encoded `last_sp` after the soft line break */
+						out[i++] = '=';
+						out[i++] = hexdigests[((last_sp >> 4) & 0xF)];
+						out[i++] = hexdigests[(last_sp & 0xF)];
+
+						span = 0;
+					}
+					else {
+						/* Encode last space */
+						--i;
+						out[i++] = '=';
+						out[i++] = hexdigests[((last_sp >> 4) & 0xF)];
+						out[i++] = hexdigests[(last_sp & 0xF)];
+						seen_spaces = 0;
+					}
+
+					continue;
+				}
+				span = 0;
+			}
+			else if (ch == ' ' || ch == '\t') {
+				seen_spaces ++;
+				last_sp = ch;
+				span ++;
+			}
+			else {
+				seen_spaces = 0;
+				span ++;
+			}
+
 			out[i++] = ch;
-			span ++;
 		}
 		else {
-			if (str_len > 0 && span + 5 >= str_len) {
+			if (QP_SPAN_SPECIAL(span, str_len)) {
 				/* Add new line and then continue */
+				if (p + 1 < end || span + 3 >= str_len) {
+					switch (how) {
+					default:
+					case RSPAMD_TASK_NEWLINES_CRLF:
+						out[i++] = '=';
+						out[i++] = '\r';
+						out[i++] = '\n';
+						break;
+					case RSPAMD_TASK_NEWLINES_LF:
+						out[i++] = '=';
+						out[i++] = '\n';
+						break;
+					case RSPAMD_TASK_NEWLINES_CR:
+						out[i++] = '=';
+						out[i++] = '\r';
+						break;
+					}
+					span = 0;
+				}
+			}
+
+			out[i++] = '=';
+			out[i++] = hexdigests[((ch >> 4) & 0xF)];
+			out[i++] = hexdigests[(ch & 0xF)];
+			span += 3;
+			seen_spaces = 0;
+		}
+
+		if (QP_SPAN_NORMAL(span, str_len)) {
+			/* Add new line and then continue */
+			if (p + 1 < end || span > str_len || seen_spaces) {
 				switch (how) {
 				default:
 				case RSPAMD_TASK_NEWLINES_CRLF:
@@ -995,42 +1122,23 @@ rspamd_encode_qp_fold (const guchar *in, gsize inlen, gint str_len,
 					out[i++] = '\r';
 					break;
 				}
-
 				span = 0;
+				seen_spaces = 0;
 			}
-
-			out[i++] = '=';
-			out[i++] = hexdigests[((ch >> 4) & 0xF)];
-			out[i++] = hexdigests[(ch & 0xF)];
-			span += 3;
-		}
-
-		if (str_len > 0 && span + 3 >= str_len) {
-			/* Add new line and then continue */
-			switch (how) {
-			default:
-			case RSPAMD_TASK_NEWLINES_CRLF:
-				out[i++] = '=';
-				out[i++] = '\r';
-				out[i++] = '\n';
-				break;
-			case RSPAMD_TASK_NEWLINES_LF:
-				out[i++] = '=';
-				out[i++] = '\n';
-				break;
-			case RSPAMD_TASK_NEWLINES_CR:
-				out[i++] = '=';
-				out[i++] = '\r';
-				break;
-			}
-
-			span = 0;
 		}
 
 		g_assert (i <= olen);
 		p ++;
 	}
 
+	/* Deal with the last space character */
+	if (seen_spaces > 0) {
+		i --;
+		out[i++] = '=';
+		out[i++] = hexdigests[((last_sp >> 4) & 0xF)];
+		out[i++] = hexdigests[(last_sp & 0xF)];
+	}
+
 	out[i] = '\0';
 
 	if (outlen) {