commit 19b2617: [Rework] Use google-ced instead of libicu chardet as the former sucks

Vsevolod Stakhov vsevolod at highsecure.ru
Tue May 26 10:35:52 UTC 2020


Author: Vsevolod Stakhov
Date: 2020-05-26 11:31:47 +0100
URL: https://github.com/rspamd/rspamd/commit/19b2617fa8e2e8ec7c5181a6a2c97aa7908886c3

[Rework] Use google-ced instead of libicu chardet as the former sucks

---
 CMakeLists.txt                                     |    3 +
 contrib/google-ced/CMakeLists.txt                  |   26 +
 contrib/google-ced/LICENSE                         |  202 +
 contrib/google-ced/ced_c.cc                        |   25 +
 contrib/google-ced/ced_c.h                         |   29 +
 contrib/google-ced/compact_enc_det.cc              | 5719 ++++++++++++++++++
 contrib/google-ced/compact_enc_det.h               |   83 +
 .../google-ced/compact_enc_det_generated_tables.h  | 6326 ++++++++++++++++++++
 .../google-ced/compact_enc_det_generated_tables2.h |  856 +++
 contrib/google-ced/compact_enc_det_hint_code.cc    |  169 +
 contrib/google-ced/compact_enc_det_hint_code.h     |   45 +
 contrib/google-ced/detail_head_string.inc          |  152 +
 contrib/google-ced/util/basictypes.h               |  331 +
 contrib/google-ced/util/case_insensitive_hash.h    |   88 +
 contrib/google-ced/util/commandlineflags.h         |   39 +
 contrib/google-ced/util/encodings/encodings.cc     |  891 +++
 contrib/google-ced/util/encodings/encodings.h      |  299 +
 contrib/google-ced/util/encodings/encodings.pb.h   |  181 +
 .../util/encodings/encodings_unittest.cc           |   34 +
 contrib/google-ced/util/languages/languages.cc     |  349 ++
 contrib/google-ced/util/languages/languages.h      |  381 ++
 contrib/google-ced/util/languages/languages.pb.h   |  191 +
 contrib/google-ced/util/logging.h                  |   25 +
 contrib/google-ced/util/port.h                     |   53 +
 contrib/google-ced/util/string_util.h              |   61 +
 contrib/google-ced/util/varsetter.h                |   66 +
 src/libmime/mime_encoding.c                        |   33 +-
 27 files changed, 16633 insertions(+), 24 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 71c5d1761..e60b9cf84 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -623,6 +623,8 @@ ADD_SUBDIRECTORY(contrib/t1ha)
 ADD_SUBDIRECTORY(contrib/libev)
 ADD_SUBDIRECTORY(contrib/kann)
 ADD_SUBDIRECTORY(contrib/fastutf8)
+ADD_SUBDIRECTORY(contrib/google-ced)
+
 
 IF (NOT WITH_LUAJIT)
 	ADD_SUBDIRECTORY(contrib/lua-bit)
@@ -643,6 +645,7 @@ LIST(APPEND RSPAMD_REQUIRED_LIBRARIES rspamd-actrie)
 LIST(APPEND RSPAMD_REQUIRED_LIBRARIES rspamd-t1ha)
 LIST(APPEND RSPAMD_REQUIRED_LIBRARIES rspamd-ev)
 LIST(APPEND RSPAMD_REQUIRED_LIBRARIES rspamd-kann)
+LIST(APPEND RSPAMD_REQUIRED_LIBRARIES rspamd-ced)
 
 IF(ENABLE_CLANG_PLUGIN MATCHES "ON")
 	ADD_SUBDIRECTORY(clang-plugin)
diff --git a/contrib/google-ced/CMakeLists.txt b/contrib/google-ced/CMakeLists.txt
new file mode 100644
index 000000000..e1c1c3edb
--- /dev/null
+++ b/contrib/google-ced/CMakeLists.txt
@@ -0,0 +1,26 @@
+# Copyright 2016 Google Inc.  All Rights Reserved.
+# Use of this source code is governed by a BSD-style
+# license that can be found in the LICENSE file.
+# NOTE(review): the LICENSE shipped next to this file is Apache-2.0 -- confirm.
+project(CED CXX)
+set(CMAKE_SUPPRESS_DEVELOPER_WARNINGS 1 CACHE INTERNAL "No dev warnings")
+
+option(BUILD_SHARED_LIBS "Build shared libraries" OFF)
+# Build as C++11 and silence narrowing diagnostics for the known compilers.
+if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU")
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -Wno-narrowing")
+elseif("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang")  # Clang and AppleClang
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -Wno-c++11-narrowing")
+endif()
+# Sources include their headers relative to this directory.
+include_directories(${CMAKE_CURRENT_SOURCE_DIR})
+
+set(CED_LIBRARY_SOURCES
+    compact_enc_det.cc
+    compact_enc_det_hint_code.cc
+    util/encodings/encodings.cc
+    util/languages/languages.cc
+    ced_c.cc
+    )
+# Always a static archive, regardless of BUILD_SHARED_LIBS.
+add_library(rspamd-ced STATIC ${CED_LIBRARY_SOURCES})
diff --git a/contrib/google-ced/LICENSE b/contrib/google-ced/LICENSE
new file mode 100644
index 000000000..d64569567
--- /dev/null
+++ b/contrib/google-ced/LICENSE
@@ -0,0 +1,202 @@
+
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright [yyyy] [name of copyright owner]
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
diff --git a/contrib/google-ced/ced_c.cc b/contrib/google-ced/ced_c.cc
new file mode 100644
index 000000000..d8f70a87e
--- /dev/null
+++ b/contrib/google-ced/ced_c.cc
@@ -0,0 +1,25 @@
+#include "ced_c.h"
+#include "compact_enc_det.h"
+
+// C-callable wrapper around CompactEncDet::DetectEncoding: returns the name
+// of the detected encoding, or nullptr when the detector's result is not a
+// valid encoding. The language hint is always default_language().
+const char* ced_encoding_detect(const char* text, int text_length,
+		const char* url_hint,
+		const char* http_charset_hint,
+		const char* meta_charset_hint,
+		const int encoding_hint,
+		CedTextCorpusType corpus_type, bool ignore_7bit_mail_encodings,
+		int* bytes_consumed, bool* is_reliable)
+{
+	// The C-side corpus value is handed over as the detector's own enum type.
+	const auto corpus =
+			static_cast<CompactEncDet::TextCorpusType>(corpus_type);
+
+	const auto detected = CompactEncDet::DetectEncoding(text, text_length,
+			url_hint, http_charset_hint, meta_charset_hint, encoding_hint,
+			default_language(), corpus, ignore_7bit_mail_encodings,
+			bytes_consumed, is_reliable);
+
+	return IsValidEncoding(detected) ? EncodingName(detected) : nullptr;
+}
diff --git a/contrib/google-ced/ced_c.h b/contrib/google-ced/ced_c.h
new file mode 100644
index 000000000..c8cb16a2a
--- /dev/null
+++ b/contrib/google-ced/ced_c.h
@@ -0,0 +1,29 @@
+#ifndef RSPAMD_CED_C_H
+#define RSPAMD_CED_C_H
+
+#include <stdbool.h>
+
+#ifdef  __cplusplus
+extern "C" {
+#endif
+enum CedTextCorpusType { /* ced_c.cc casts this as-is to CompactEncDet::TextCorpusType */
+	CED_WEB_CORPUS,
+	CED_XML_CORPUS,
+	CED_QUERY_CORPUS,
+	CED_EMAIL_CORPUS,
+	CED_NUM_CORPA, /* presumably the number of corpus types, not a corpus itself -- confirm */
+};
+/* Detects the encoding of text; returns its name, or NULL if no valid encoding was found. */
+const char *ced_encoding_detect (const char *text, int text_length,
+								 const char *url_hint,
+								 const char *http_charset_hint,
+								 const char *meta_charset_hint,
+								 const int encoding_hint,
+								 enum CedTextCorpusType corpus_type,
+								 bool ignore_7bit_mail_encodings,
+								 int *bytes_consumed, bool *is_reliable);
+
+#ifdef  __cplusplus
+}
+#endif
+#endif
diff --git a/contrib/google-ced/compact_enc_det.cc b/contrib/google-ced/compact_enc_det.cc
new file mode 100644
index 000000000..9f9c3a840
--- /dev/null
+++ b/contrib/google-ced/compact_enc_det.cc
@@ -0,0 +1,5719 @@
+// Copyright 2016 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+#include "compact_enc_det.h"
+
+#include <math.h>                       // for sqrt
+#include <stddef.h>                     // for size_t
+#include <stdio.h>                      // for printf, fprintf, NULL, etc
+#include <stdlib.h>                     // for qsort
+#include <string.h>                     // for memset, memcpy, memcmp, etc
+#include <memory>
+#include <string>                       // for string, operator==, etc
+
+#include "compact_enc_det_hint_code.h"
+#include "util/string_util.h"
+#include "util/basictypes.h"
+#include "util/commandlineflags.h"
+#include "util/logging.h"
+
+using std::string;
+
+// TODO as of 2007.10.09:
+//
+// Consider font=TT-BHxxx as user-defined => binary
+// Demote GB18030 if no 8x3x pair
+// Map byte2 ascii punct to 0x60, digits to 0x7e, gets them into hires
+// Consider removing/ignoring bytes 01-1F to avoid crap pollution
+// Possibly boost declared encoding in robust scan
+// googlebot tiny files
+// look for ranges of encodings
+// consider tags just as > < within aligned block of 32
+// flag too few characters in postproc (Latin 6 problem)
+// Remove slow scan beyond 16KB
+// Consider removing kMostLikelyEncoding or cut it in half
+
+
+// A note on mixed encodings
+//
+// The most common encoding error on the web is a page containing a mixture of
+// CP-1252 and UTF-8. A less common encoding error is a third-party feed that
+// has been converted from CP-1252 to UTF-8 and then those bytes converted a
+// second time to UTF-8. CED originally attempted to detect these error cases
+// by using two synthetic encodings, UTF8CP1252 and UTF8UTF8. The intended
+// implementation was to start these just below CP1252 and UTF8 respectively in
+// overall likelihood, and allow 1252 and UTF8 to fall behind if mixtures are
+// found.
+//
+// The UTF8UTF8 encoding is a possible outcome from CED, but unfortunately the
+// UTF8CP1252 internal encoding was added late and not put into encodings.proto,
+// so at the final step it is mapped to UTF8UTF8 also. This was a bad idea and
+// is removed in this November 2011 CL.
+//
+// Mixed encoding detection never worked out as well as envisioned, so the
+// ced_allow_utf8utf8 flag normally disables all this.
+//
+// The effect is that CP-1252 and UTF-8 mixtures will usually be detected as
+// UTF8, and the inputconverter code for UTF8 normally will convert bare
+// CP-1252 bytes to UTF-8, instead of the less-helpful FFFD substitution. UTF-8
+// and double-UTF-8 mixtures will be detected as UTF-8, and the double
+// conversion will stand.
+//
+// However, it is occasionally useful to use CED to detect double-converted
+// UTF-8 coming from third-party data feeds, so they can be fixed at the source.
+// For this purpose, the UTF8UTF8 encoding remains available under the
+// ced_allow_utf8utf8 flag.
+//
+// When UTF8UTF8 is detected, the inputconverter code will undo the double
+// conversion, giving good text.
+
+// Norbert Runge has noted these words in CP1252 that are mistakenly identified
+// as UTF-8 because of the last pair of characters:
+//  NESTLÉ®               0xC9 0xAE U+00C9 U+00AE   C9AE = U+026E;SMALL LEZH
+//  drauß\u2019           0xDF 0x92 U+00DF U+2019   DF92 = U+07D2;NKO LETTER N
+//  Mutterschoß\u201c     0xDF 0x93 U+00DF U+201C   DF93 = U+07D3;NKO LETTER BA
+//  Schoß\u201c           0xDF 0x93 U+00DF U+201C
+//  weiß\u201c            0xDF 0x93 U+00DF U+201C
+//  Schnellfuß\u201c      0xDF 0x93 U+00DF U+201C
+//  süß«                  0xDF 0xAB U+00DF U+00AB   DFAB = U+07EB;NKO HIGH TONE
+// These four byte combinations now explicitly boost Latin1/CP1252.
+
+// And for reference, here are a couple of Portuguese spellings
+// that may be mistaken as double-byte encodings.
+//   informações          0xE7 0xF5
+//   traição              0xE7 0xE3
+
+
+static const char* kVersion = "2.2";
+
+DEFINE_bool(ced_allow_utf8utf8, false, "Allow the UTF8UTF8 encoding, "
+                                       "to handle mixtures of CP1252 "
+                                       "converted to UTF-8 zero, one, "
+                                       "or two times");
+DEFINE_int32(enc_detect_slow_max_kb, 16,
+             "Maximum number of Kbytes to examine for "
+             "7-bit-only (2022, Hz, UTF7) encoding detect. "
+             "You are unlikely to want to change this.");
+DEFINE_int32(enc_detect_fast_max_kb, 256,
+             "Maximum number of Kbytes to examine for encoding detect. "
+             "You are unlikely to want to change this.");
+
+DEFINE_int32(ced_reliable_difference, 300, "30 * Bits of minimum probablility "
+             "difference 1st - 2nd to be considered reliable \n"
+             "  2 corresponds to min 4x difference\n"
+             "  4 corresponds to min 16x difference\n"
+             "  8 corresponds to min 256x difference\n"
+             "  10 corresponds to min 1024x difference\n"
+             "  20 corresponds to min 1Mx difference.");
+
+// Text debug output options
+DEFINE_bool(enc_detect_summary, false,
+            "Print first 16 interesting pairs at exit.");
+DEFINE_bool(counts, false, "Count major-section usage");
+
+// PostScript debug output options
+DEFINE_bool(enc_detect_detail, false,
+             "Print PostScript of every update, to stderr.");
+DEFINE_bool(enc_detect_detail2, false,
+             "More PostScript detail of every update, to stderr.");
+DEFINE_bool(enc_detect_source, false, "Include source text in detail");
+// Encoding name must exactly match FIRST column of kI18NInfoByEncoding in
+// lang_enc.cc
+
+// Following flags are not in use. Replace them with constants to
+// avoid static initialization.
+
+//DEFINE_string(enc_detect_watch1, "", "Do detail2 about this encoding name.");
+//DEFINE_string(enc_detect_watch2, "", "Do detail2 about this encoding name.");
+
+static const char* const FLAGS_enc_detect_watch1 = "";
+static const char* const FLAGS_enc_detect_watch2 = "";
+
+// Only for experiments. Delete soon.
+DEFINE_bool(force127, false, "Force Latin1, Latin2, Latin7 based on trigrams");
+
+// Demo-mode/debugging experiment
+DEFINE_bool(demo_nodefault, false,
+             "Default to all equal; no boost for declared encoding.");
+DEFINE_bool(dirtsimple, false, "Just scan and count for all encodings");
+DEFINE_bool(ced_echo_input, false, "Echo ced input to stderr");
+
+
+static const int XDECILOG2 = 3;             // Multiplier for log base 2 ** n/10
+static const int XLOG2 = 30;                // Multiplier for log base 2 ** n
+
+static const int kFinalPruneDifference = 10 * XLOG2;
+                                            // Final bits of minimum
+                                            // probability difference 1st-nth
+                                            // to be pruned
+
+static const int kInititalPruneDifference = kFinalPruneDifference * 4;
+                                            // Initial bits of minimum
+                                            // probability difference 1st-nth
+                                            // to be pruned
+                                            //
+static const int kPruneDiffDecrement = kFinalPruneDifference;
+                                            // Decrements bits of minimum
+                                            // probability difference 1st-nth
+                                            // to be pruned
+
+static const int kSmallInitDiff = 2 * XLOG2;       // bits of minimum
+                                            // probability difference, base to
+                                            // superset encodings
+
+static const int kBoostInitial = 20 * XLOG2;    // bits of boost for
+                                            // initial byte patterns (BOM, 00)
+
+static const int kBadPairWhack = 20 * XLOG2;    // bits of whack for
+                                            // one bad pair
+
+static const int kBoostOnePair = 20 * XLOG2;    // bits of boost for
+                                            // one good pair in Hz, etc.
+
+static const int kGentleOnePair = 4 * XLOG2;    // bits of boost for
+                                            // one good sequence
+                                            //
+static const int kGentlePairWhack = 2 * XLOG2;       // bits of whack
+                                            // for ill-formed sequence
+
+static const int kGentlePairBoost = 2 * XLOG2;       // bits of boost
+                                            // for well-formed sequence
+
+static const int kDeclaredEncBoost = 5 * XDECILOG2;  // bits/10 of boost for
+                                            // best declared encoding per bigram
+
+static const int kBestEncBoost = 5 * XDECILOG2;     // bits/10 of boost for
+                                            // best encoding per bigram
+
+static const int kTrigramBoost = 2 * XLOG2; // bits of boost for Latin127 tri
+
+static const int kMaxPairs = 48;            // Max interesting pairs to look at
+                                            // If you change this,
+                                            // adjust *PruneDiff*
+
+static const int kPruneMask = 0x07;         // Prune every 8 interesting pairs
+
+
+static const int kBestPairsCount = 16;      // For first N pairs, do extra boost
+                                            // based on most likely encoding
+                                            // of pair over entire web
+
+static const int kDerateHintsBelow = 12;    // If we have fewer than N bigrams,
+                                            // weaken the hints enough that
+                                            // unhinted encodings have a hope of
+                                            // rising to the top
+
+static const int kMinRescanLength = 800;    // Don't bother rescanning for
+                                            // unreliable encoding if fewer
+                                            // than this many bytes unscanned.
+                                            // We will rescan at most last half
+                                            // of this.
+
+static const int kStrongBinary = 12;  // Make F_BINARY the only encoding
+static const int kWeakerBinary = 4;   // Make F_BINARY likely encoding
+
+// These are byte counts from front of file
+static const int kBinaryHardAsciiLimit = 6 * 1024;  // Not binary if all ASCII
+static const int kBinarySoftAsciiLimit = 8 * 1024;  //   "   if mostly ASCII
+
+// We try here to avoid having title text dominate the encoding detection,
+// for the not-infrequent error case of title in encoding1, body in encoding2:
+// we want to bias toward encoding2 winning.
+//
+// kMaxBigramsTagTitleText should be a multiple of 2, 3, and 4, so that we
+// rarely cut off mid-character in the original (not-yet-detected) encoding.
+// This matters most for UTF-8 two- and three-byte codes and for
+// Shift-JIS three-byte codes.
+static const int kMaxBigramsTagTitleText = 12;      // Keep only some tag text
+static const int kWeightshiftForTagTitleText = 4;   // Give text in tags, etc.
+                                                    // 1/16 normal weight
+
+static const int kStrongPairs = 6;          // Let reliable enc with this many
+                                            // pairs overcome missing hint
+
+enum CEDInternalFlags {   // Type of the flags argument of InternalDetectEncoding
+  kCEDNone = 0,           // The empty flag
+  kCEDRescanning = 1,     // Do not further recurse
+  kCEDSlowscore = 2,      // Do extra scoring
+  kCEDForceTags = 4,      // Always examine text inside tags
+};
+
+// Forward declaration: flag-driven detector that also reports second_best_enc
+Encoding InternalDetectEncoding(
+    CEDInternalFlags flags, const char* text, int text_length,
+    const char* url_hint, const char* http_charset_hint,
+    const char* meta_charset_hint, const int encoding_hint,
+    const Language language_hint,  // User interface lang
+    const CompactEncDet::TextCorpusType corpus_type,
+    bool ignore_7bit_mail_encodings, int* bytes_consumed, bool* is_reliable,
+    Encoding* second_best_enc);
+
+typedef struct {        // Bigram statistics; 256-slot tables presumably indexed by byte value -- confirm
+  const uint8* hires[4];  // Pointers to possible high-resolution bigram deltas
+  uint8 x_bar;          // Average byte2 value
+  uint8 y_bar;          // Average byte1 value
+  uint8 x_stddev;       // Standard deviation of byte2 value
+  uint8 y_stddev;       // Standard deviation of byte1 value
+  int so;               // Scaling offset -- add to probabilities below
+  uint8 b1[256];        // Unigram probability for first byte of aligned bigram
+  uint8 b2[256];        // Unigram probability for second byte of aligned bigram
+  uint8 b12[256];       // Unigram probability for cross bytes of aligned bigram
+} UnigramEntry;
+
+//typedef struct {
+//  uint8 b12[256*256]; // Bigram probability for aligned bigram
+//} FullBigramEntry;
+
+
+// Include all the postproc-generated tables here:
+// RankedEncoding
+// kMapToEncoding
+// unigram_table
+// kMostLikelyEncoding
+// kTLDHintProbs
+// kCharsetHintProbs
+// HintEntry, kMaxTldKey kMaxTldVector, etc.
+// =============================================================================
+
+#include "compact_enc_det_generated_tables.h"
+
+
+#define F_ASCII F_Latin1    // "ASCII" is a misnomer, so this code uses "Latin1"
+
+#define F_BINARY F_X_BINARYENC        // We are mid-update for name change
+#define F_UTF8UTF8 F_X_UTF8UTF8       // We are mid-update for name change
+#define F_BIG5_CP950 F_BIG5           // We are mid-update for name change
+#define F_Unicode F_UTF_16LE          // We are mid-update for name change
+// =============================================================================
+
+// 7-bit encodings have at least one "interesting" byte value < 0x80
+//   (00 0E 1B + ~)
+// JIS 2022-cn 2022-kr hz utf7
+// Unicode UTF-16 UTF-32
+// 8-bit encodings have no interesting byte values < 0x80
+static const uint32 kSevenBitActive = 0x00000001;   // needs <80 to detect
+static const uint32 kUTF7Active     = 0x00000002;   // <80 and +
+static const uint32 kHzActive       = 0x00000004;   // <80 and ~
+static const uint32 kIso2022Active  = 0x00000008;   // <80 and 1B 0E 0F
+static const uint32 kUTF8Active     = 0x00000010;
+static const uint32 kUTF8UTF8Active = 0x00000020;
+static const uint32 kUTF1632Active  = 0x00000040;   // <80 and 00
+static const uint32 kBinaryActive   = 0x00000080;   // <80 and 00
+static const uint32 kTwobyteCode    = 0x00000100;   // Needs 8xxx
+static const uint32 kIsIndicCode    = 0x00000200;   //
+static const uint32 kHighAlphaCode  = 0x00000400;   // full alphabet in 8x-Fx
+static const uint32 kHighAccentCode = 0x00000800;   // accents in 8x-Fx
+static const uint32 kEUCJPActive    = 0x00001000;   // Have to mess with phase
+
+
+// Debug only. not thread safe
+static int encdet_used = 0;
+static int rescore_used = 0;
+static int rescan_used = 0;
+static int robust_used = 0;
+static int looking_used = 0;
+static int doing_used = 0;
+
+
+// For debugging only -- about 256B/entry times about 500 = 128KB
*** OUTPUT TRUNCATED, 16187 LINES SKIPPED ***


More information about the Commits mailing list