commit 19b2617: [Rework] Use google-ced instead of libicu chardet as the former sucks
Vsevolod Stakhov
vsevolod at highsecure.ru
Tue May 26 10:35:52 UTC 2020
Author: Vsevolod Stakhov
Date: 2020-05-26 11:31:47 +0100
URL: https://github.com/rspamd/rspamd/commit/19b2617fa8e2e8ec7c5181a6a2c97aa7908886c3
[Rework] Use google-ced instead of libicu chardet as the former sucks
---
CMakeLists.txt | 3 +
contrib/google-ced/CMakeLists.txt | 26 +
contrib/google-ced/LICENSE | 202 +
contrib/google-ced/ced_c.cc | 25 +
contrib/google-ced/ced_c.h | 29 +
contrib/google-ced/compact_enc_det.cc | 5719 ++++++++++++++++++
contrib/google-ced/compact_enc_det.h | 83 +
.../google-ced/compact_enc_det_generated_tables.h | 6326 ++++++++++++++++++++
.../google-ced/compact_enc_det_generated_tables2.h | 856 +++
contrib/google-ced/compact_enc_det_hint_code.cc | 169 +
contrib/google-ced/compact_enc_det_hint_code.h | 45 +
contrib/google-ced/detail_head_string.inc | 152 +
contrib/google-ced/util/basictypes.h | 331 +
contrib/google-ced/util/case_insensitive_hash.h | 88 +
contrib/google-ced/util/commandlineflags.h | 39 +
contrib/google-ced/util/encodings/encodings.cc | 891 +++
contrib/google-ced/util/encodings/encodings.h | 299 +
contrib/google-ced/util/encodings/encodings.pb.h | 181 +
.../util/encodings/encodings_unittest.cc | 34 +
contrib/google-ced/util/languages/languages.cc | 349 ++
contrib/google-ced/util/languages/languages.h | 381 ++
contrib/google-ced/util/languages/languages.pb.h | 191 +
contrib/google-ced/util/logging.h | 25 +
contrib/google-ced/util/port.h | 53 +
contrib/google-ced/util/string_util.h | 61 +
contrib/google-ced/util/varsetter.h | 66 +
src/libmime/mime_encoding.c | 33 +-
27 files changed, 16633 insertions(+), 24 deletions(-)
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 71c5d1761..e60b9cf84 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -623,6 +623,8 @@ ADD_SUBDIRECTORY(contrib/t1ha)
ADD_SUBDIRECTORY(contrib/libev)
ADD_SUBDIRECTORY(contrib/kann)
ADD_SUBDIRECTORY(contrib/fastutf8)
+ADD_SUBDIRECTORY(contrib/google-ced)
+
IF (NOT WITH_LUAJIT)
ADD_SUBDIRECTORY(contrib/lua-bit)
@@ -643,6 +645,7 @@ LIST(APPEND RSPAMD_REQUIRED_LIBRARIES rspamd-actrie)
LIST(APPEND RSPAMD_REQUIRED_LIBRARIES rspamd-t1ha)
LIST(APPEND RSPAMD_REQUIRED_LIBRARIES rspamd-ev)
LIST(APPEND RSPAMD_REQUIRED_LIBRARIES rspamd-kann)
+LIST(APPEND RSPAMD_REQUIRED_LIBRARIES rspamd-ced)
IF(ENABLE_CLANG_PLUGIN MATCHES "ON")
ADD_SUBDIRECTORY(clang-plugin)
diff --git a/contrib/google-ced/CMakeLists.txt b/contrib/google-ced/CMakeLists.txt
new file mode 100644
index 000000000..e1c1c3edb
--- /dev/null
+++ b/contrib/google-ced/CMakeLists.txt
@@ -0,0 +1,26 @@
+# Copyright 2016 Google Inc. All Rights Reserved.
+# Use of this source code is governed by a BSD-style
+# license that can be found in the LICENSE file.
+# Builds the bundled Compact Encoding Detector as the static library rspamd-ced.
+project(CED CXX)
+set(CMAKE_SUPPRESS_DEVELOPER_WARNINGS 1 CACHE INTERNAL "No dev warnings")
+
+option(BUILD_SHARED_LIBS "Build shared libraries" OFF) # no effect on rspamd-ced: it is explicitly STATIC
+# C++11 is required (ced_c.cc uses auto/nullptr); narrowing diagnostics are silenced.
+if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU")
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -Wno-narrowing")
+elseif("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang") # MATCHES so that AppleClang gets the flags too
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -Wno-c++11-narrowing")
+endif()
+
+set(CED_LIBRARY_SOURCES
+  compact_enc_det.cc
+  compact_enc_det_hint_code.cc
+  util/encodings/encodings.cc
+  util/languages/languages.cc
+  ced_c.cc
+  )
+
+add_library(rspamd-ced STATIC ${CED_LIBRARY_SOURCES})
+# Sources include headers relative to this directory (e.g. "util/basictypes.h").
+target_include_directories(rspamd-ced PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}")
diff --git a/contrib/google-ced/LICENSE b/contrib/google-ced/LICENSE
new file mode 100644
index 000000000..d64569567
--- /dev/null
+++ b/contrib/google-ced/LICENSE
@@ -0,0 +1,202 @@
+
+ Apache License
+ Version 2.0, January 2004
+ http://www.apache.org/licenses/
+
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+ 1. Definitions.
+
+ "License" shall mean the terms and conditions for use, reproduction,
+ and distribution as defined by Sections 1 through 9 of this document.
+
+ "Licensor" shall mean the copyright owner or entity authorized by
+ the copyright owner that is granting the License.
+
+ "Legal Entity" shall mean the union of the acting entity and all
+ other entities that control, are controlled by, or are under common
+ control with that entity. For the purposes of this definition,
+ "control" means (i) the power, direct or indirect, to cause the
+ direction or management of such entity, whether by contract or
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
+ outstanding shares, or (iii) beneficial ownership of such entity.
+
+ "You" (or "Your") shall mean an individual or Legal Entity
+ exercising permissions granted by this License.
+
+ "Source" form shall mean the preferred form for making modifications,
+ including but not limited to software source code, documentation
+ source, and configuration files.
+
+ "Object" form shall mean any form resulting from mechanical
+ transformation or translation of a Source form, including but
+ not limited to compiled object code, generated documentation,
+ and conversions to other media types.
+
+ "Work" shall mean the work of authorship, whether in Source or
+ Object form, made available under the License, as indicated by a
+ copyright notice that is included in or attached to the work
+ (an example is provided in the Appendix below).
+
+ "Derivative Works" shall mean any work, whether in Source or Object
+ form, that is based on (or derived from) the Work and for which the
+ editorial revisions, annotations, elaborations, or other modifications
+ represent, as a whole, an original work of authorship. For the purposes
+ of this License, Derivative Works shall not include works that remain
+ separable from, or merely link (or bind by name) to the interfaces of,
+ the Work and Derivative Works thereof.
+
+ "Contribution" shall mean any work of authorship, including
+ the original version of the Work and any modifications or additions
+ to that Work or Derivative Works thereof, that is intentionally
+ submitted to Licensor for inclusion in the Work by the copyright owner
+ or by an individual or Legal Entity authorized to submit on behalf of
+ the copyright owner. For the purposes of this definition, "submitted"
+ means any form of electronic, verbal, or written communication sent
+ to the Licensor or its representatives, including but not limited to
+ communication on electronic mailing lists, source code control systems,
+ and issue tracking systems that are managed by, or on behalf of, the
+ Licensor for the purpose of discussing and improving the Work, but
+ excluding communication that is conspicuously marked or otherwise
+ designated in writing by the copyright owner as "Not a Contribution."
+
+ "Contributor" shall mean Licensor and any individual or Legal Entity
+ on behalf of whom a Contribution has been received by Licensor and
+ subsequently incorporated within the Work.
+
+ 2. Grant of Copyright License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ copyright license to reproduce, prepare Derivative Works of,
+ publicly display, publicly perform, sublicense, and distribute the
+ Work and such Derivative Works in Source or Object form.
+
+ 3. Grant of Patent License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ (except as stated in this section) patent license to make, have made,
+ use, offer to sell, sell, import, and otherwise transfer the Work,
+ where such license applies only to those patent claims licensable
+ by such Contributor that are necessarily infringed by their
+ Contribution(s) alone or by combination of their Contribution(s)
+ with the Work to which such Contribution(s) was submitted. If You
+ institute patent litigation against any entity (including a
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
+ or a Contribution incorporated within the Work constitutes direct
+ or contributory patent infringement, then any patent licenses
+ granted to You under this License for that Work shall terminate
+ as of the date such litigation is filed.
+
+ 4. Redistribution. You may reproduce and distribute copies of the
+ Work or Derivative Works thereof in any medium, with or without
+ modifications, and in Source or Object form, provided that You
+ meet the following conditions:
+
+ (a) You must give any other recipients of the Work or
+ Derivative Works a copy of this License; and
+
+ (b) You must cause any modified files to carry prominent notices
+ stating that You changed the files; and
+
+ (c) You must retain, in the Source form of any Derivative Works
+ that You distribute, all copyright, patent, trademark, and
+ attribution notices from the Source form of the Work,
+ excluding those notices that do not pertain to any part of
+ the Derivative Works; and
+
+ (d) If the Work includes a "NOTICE" text file as part of its
+ distribution, then any Derivative Works that You distribute must
+ include a readable copy of the attribution notices contained
+ within such NOTICE file, excluding those notices that do not
+ pertain to any part of the Derivative Works, in at least one
+ of the following places: within a NOTICE text file distributed
+ as part of the Derivative Works; within the Source form or
+ documentation, if provided along with the Derivative Works; or,
+ within a display generated by the Derivative Works, if and
+ wherever such third-party notices normally appear. The contents
+ of the NOTICE file are for informational purposes only and
+ do not modify the License. You may add Your own attribution
+ notices within Derivative Works that You distribute, alongside
+ or as an addendum to the NOTICE text from the Work, provided
+ that such additional attribution notices cannot be construed
+ as modifying the License.
+
+ You may add Your own copyright statement to Your modifications and
+ may provide additional or different license terms and conditions
+ for use, reproduction, or distribution of Your modifications, or
+ for any such Derivative Works as a whole, provided Your use,
+ reproduction, and distribution of the Work otherwise complies with
+ the conditions stated in this License.
+
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
+ any Contribution intentionally submitted for inclusion in the Work
+ by You to the Licensor shall be under the terms and conditions of
+ this License, without any additional terms or conditions.
+ Notwithstanding the above, nothing herein shall supersede or modify
+ the terms of any separate license agreement you may have executed
+ with Licensor regarding such Contributions.
+
+ 6. Trademarks. This License does not grant permission to use the trade
+ names, trademarks, service marks, or product names of the Licensor,
+ except as required for reasonable and customary use in describing the
+ origin of the Work and reproducing the content of the NOTICE file.
+
+ 7. Disclaimer of Warranty. Unless required by applicable law or
+ agreed to in writing, Licensor provides the Work (and each
+ Contributor provides its Contributions) on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ implied, including, without limitation, any warranties or conditions
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+ PARTICULAR PURPOSE. You are solely responsible for determining the
+ appropriateness of using or redistributing the Work and assume any
+ risks associated with Your exercise of permissions under this License.
+
+ 8. Limitation of Liability. In no event and under no legal theory,
+ whether in tort (including negligence), contract, or otherwise,
+ unless required by applicable law (such as deliberate and grossly
+ negligent acts) or agreed to in writing, shall any Contributor be
+ liable to You for damages, including any direct, indirect, special,
+ incidental, or consequential damages of any character arising as a
+ result of this License or out of the use or inability to use the
+ Work (including but not limited to damages for loss of goodwill,
+ work stoppage, computer failure or malfunction, or any and all
+ other commercial damages or losses), even if such Contributor
+ has been advised of the possibility of such damages.
+
+ 9. Accepting Warranty or Additional Liability. While redistributing
+ the Work or Derivative Works thereof, You may choose to offer,
+ and charge a fee for, acceptance of support, warranty, indemnity,
+ or other liability obligations and/or rights consistent with this
+ License. However, in accepting such obligations, You may act only
+ on Your own behalf and on Your sole responsibility, not on behalf
+ of any other Contributor, and only if You agree to indemnify,
+ defend, and hold each Contributor harmless for any liability
+ incurred by, or claims asserted against, such Contributor by reason
+ of your accepting any such warranty or additional liability.
+
+ END OF TERMS AND CONDITIONS
+
+ APPENDIX: How to apply the Apache License to your work.
+
+ To apply the Apache License to your work, attach the following
+ boilerplate notice, with the fields enclosed by brackets "[]"
+ replaced with your own identifying information. (Don't include
+ the brackets!) The text should be enclosed in the appropriate
+ comment syntax for the file format. We also recommend that a
+ file or class name and description of purpose be included on the
+ same "printed page" as the copyright notice for easier
+ identification within third-party archives.
+
+ Copyright [yyyy] [name of copyright owner]
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
diff --git a/contrib/google-ced/ced_c.cc b/contrib/google-ced/ced_c.cc
new file mode 100644
index 000000000..d8f70a87e
--- /dev/null
+++ b/contrib/google-ced/ced_c.cc
@@ -0,0 +1,25 @@
+#include "ced_c.h"
+#include "compact_enc_det.h"
+
+const char* ced_encoding_detect(const char* text, int text_length,
+ const char* url_hint,
+ const char* http_charset_hint,
+ const char* meta_charset_hint,
+ const int encoding_hint,
+ CedTextCorpusType corpus_type, bool ignore_7bit_mail_encodings,
+ int* bytes_consumed, bool* is_reliable)
+{
+	// C-callable shim over CompactEncDet::DetectEncoding; the language hint is
+	// always default_language(). The C corpus enum is converted by a plain cast.
+	auto corpus = static_cast<CompactEncDet::TextCorpusType>(corpus_type);
+	auto detected = CompactEncDet::DetectEncoding(
+		text, text_length, url_hint, http_charset_hint, meta_charset_hint,
+		encoding_hint, default_language(), corpus, ignore_7bit_mail_encodings,
+		bytes_consumed, is_reliable);
+
+	// Only a valid encoding has a name to hand back; anything else yields nullptr.
+	if (!IsValidEncoding(detected)) {
+		return nullptr;
+	}
+	return EncodingName(detected);
+}
diff --git a/contrib/google-ced/ced_c.h b/contrib/google-ced/ced_c.h
new file mode 100644
index 000000000..c8cb16a2a
--- /dev/null
+++ b/contrib/google-ced/ced_c.h
@@ -0,0 +1,29 @@
+#ifndef RSPAMD_CED_C_H
+#define RSPAMD_CED_C_H
+
+#include <stdbool.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+enum CedTextCorpusType { /* converted by static_cast to CompactEncDet::TextCorpusType in ced_c.cc */
+ CED_WEB_CORPUS,
+ CED_XML_CORPUS,
+ CED_QUERY_CORPUS,
+ CED_EMAIL_CORPUS,
+ CED_NUM_CORPA, /* NOTE(review): presumably a count sentinel rather than a usable corpus -- confirm */
+};
+
+const char *ced_encoding_detect (const char *text, int text_length, /* C wrapper: forwards its arguments to CompactEncDet::DetectEncoding */
+ const char *url_hint,
+ const char *http_charset_hint,
+ const char *meta_charset_hint,
+ const int encoding_hint,
+ enum CedTextCorpusType corpus_type,
+ bool ignore_7bit_mail_encodings,
+ int *bytes_consumed, bool *is_reliable); /* returns the encoding name, or NULL when the detected encoding is not valid */
+
+#ifdef __cplusplus
+}
+#endif
+#endif
diff --git a/contrib/google-ced/compact_enc_det.cc b/contrib/google-ced/compact_enc_det.cc
new file mode 100644
index 000000000..9f9c3a840
--- /dev/null
+++ b/contrib/google-ced/compact_enc_det.cc
@@ -0,0 +1,5719 @@
+// Copyright 2016 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+#include "compact_enc_det.h"
+
+#include <math.h> // for sqrt
+#include <stddef.h> // for size_t
+#include <stdio.h> // for printf, fprintf, NULL, etc
+#include <stdlib.h> // for qsort
+#include <string.h> // for memset, memcpy, memcmp, etc
+#include <memory>
+#include <string> // for string, operator==, etc
+
+#include "compact_enc_det_hint_code.h"
+#include "util/string_util.h"
+#include "util/basictypes.h"
+#include "util/commandlineflags.h"
+#include "util/logging.h"
+
+using std::string;
+
+// TODO as of 2007.10.09:
+//
+// Consider font=TT-BHxxx as user-defined => binary
+// Demote GB18030 if no 8x3x pair
+// Map byte2 ascii punct to 0x60, digits to 0x7e, gets them into hires
+// Consider removing/ignoring bytes 01-1F to avoid crap pollution
+// Possibly boost declared encoding in robust scan
+// googlebot tiny files
+// look for ranges of encodings
+// consider tags just as > < within aligned block of 32
+// flag too few characters in postproc (Latin 6 problem)
+// Remove slow scan beyond 16KB
+// Consider removing kMostLikelyEncoding or cut it in half
+
+
+// A note on mixed encodings
+//
+// The most common encoding error on the web is a page containing a mixture of
+// CP-1252 and UTF-8. A less common encoding error is a third-party feed that
+// has been converted from CP-1252 to UTF-8 and then those bytes converted a
+// second time to UTF-8. CED originally attempted to detect these error cases
+// by using two synthetic encodings, UTF8CP1252 and UTF8UTF8. The intended
+// implementation was to start these just below CP1252 and UTF8 respectively in
+// overall likelihood, and allow 1252 and UTF8 to fall behind if mixtures are
+// found.
+//
+// The UTF8UTF8 encoding is a possible outcome from CED, but unfortunately the
+// UTF8CP1252 internal encoding was added late and not put into encodings.proto,
+// so at the final step it is mapped to UTF8UTF8 also. This was a bad idea and
+// is removed in this November 2011 CL.
+//
+// Mixed encoding detection never worked out as well as envisioned, so the
+// ced_allow_utf8utf8 flag normally disables all this.
+//
+// The effect is that CP-1252 and UTF-8 mixtures will usually be detected as
+// UTF8, and the inputconverter code for UTF8 normally will convert bare
+// CP-1252 bytes to UTF-8, instead of the less-helpful FFFD substitution. UTF-8
+// and double-UTF-8 mixtures will be detected as UTF-8, and the double
+// conversion will stand.
+//
+// However, it is occasionally useful to use CED to detect double-converted
+// UTF-8 coming from third-party data feeds, so they can be fixed at the source.
+// For this purpose, the UTF8UTF8 encoding remains available under the
+// ced_allow_utf8utf8 flag.
+//
+// When UTF8UTF8 is detected, the inputconverter code will undo the double
+// conversion, giving good text.
+
+// Norbert Runge has noted these words in CP1252 that are mistakenly identified
+// as UTF-8 because of the last pair of characters:
+// NESTLÉ® 0xC9 0xAE U+00C9 U+00AE C9AE = U+026E;SMALL LEZH
+// drauß\u2019 0xDF 0x92 U+00DF U+2019 DF92 = U+07D2;NKO LETTER N
+// Mutterschoß\u201c 0xDF 0x93 U+00DF U+201C DF93 = U+07D3;NKO LETTER BA
+// Schoß\u201c 0xDF 0x93 U+00DF U+201C
+// weiß\u201c 0xDF 0x93 U+00DF U+201C
+// Schnellfuß\u201c 0xDF 0x93 U+00DF U+201C
+// süß« 0xDF 0xAB U+00DF U+00AB DFAB = U+07EB;NKO HIGH TONE
+// These four byte combinations now explicitly boost Latin1/CP1252.
+
+// And for reference, here are a couple of Portuguese spellings
+// that may be mistaken as double-byte encodings.
+// informações 0xE7 0xF5
+// traição 0xE7 0xE3
+
+
+static const char* kVersion = "2.2";
+
+DEFINE_bool(ced_allow_utf8utf8, false, "Allow the UTF8UTF8 encoding, "
+ "to handle mixtures of CP1252 "
+ "converted to UTF-8 zero, one, "
+ "or two times");
+DEFINE_int32(enc_detect_slow_max_kb, 16,
+ "Maximum number of Kbytes to examine for "
+ "7-bit-only (2022, Hz, UTF7) encoding detect. "
+ "You are unlikely to want to change this.");
+DEFINE_int32(enc_detect_fast_max_kb, 256,
+ "Maximum number of Kbytes to examine for encoding detect. "
+ "You are unlikely to want to change this.");
+
+DEFINE_int32(ced_reliable_difference, 300, "30 * Bits of minimum probablility "
+ "difference 1st - 2nd to be considered reliable \n"
+ " 2 corresponds to min 4x difference\n"
+ " 4 corresponds to min 16x difference\n"
+ " 8 corresponds to min 256x difference\n"
+ " 10 corresponds to min 1024x difference\n"
+ " 20 corresponds to min 1Mx difference.");
+
+// Text debug output options
+DEFINE_bool(enc_detect_summary, false,
+ "Print first 16 interesting pairs at exit.");
+DEFINE_bool(counts, false, "Count major-section usage");
+
+// PostScript debug output options
+DEFINE_bool(enc_detect_detail, false,
+ "Print PostScript of every update, to stderr.");
+DEFINE_bool(enc_detect_detail2, false,
+ "More PostScript detail of every update, to stderr.");
+DEFINE_bool(enc_detect_source, false, "Include source text in detail");
+// Encoding name must exactly match FIRST column of kI18NInfoByEncoding in
+// lang_enc.cc
+
+// Following flags are not in use. Replace them with constants to
+// avoid static initialization.
+
+//DEFINE_string(enc_detect_watch1, "", "Do detail2 about this encoding name.");
+//DEFINE_string(enc_detect_watch2, "", "Do detail2 about this encoding name.");
+
+static const char* const FLAGS_enc_detect_watch1 = "";
+static const char* const FLAGS_enc_detect_watch2 = "";
+
+// Only for experiments. Delete soon.
+DEFINE_bool(force127, false, "Force Latin1, Latin2, Latin7 based on trigrams");
+
+// Demo-mode/debugging experiment
+DEFINE_bool(demo_nodefault, false,
+ "Default to all equal; no boost for declared encoding.");
+DEFINE_bool(dirtsimple, false, "Just scan and count for all encodings");
+DEFINE_bool(ced_echo_input, false, "Echo ced input to stderr");
+
+
+static const int XDECILOG2 = 3; // Multiplier for log base 2 ** n/10
+static const int XLOG2 = 30; // Multiplier for log base 2 ** n
+
+static const int kFinalPruneDifference = 10 * XLOG2;
+ // Final bits of minimum
+ // probability difference 1st-nth
+ // to be pruned
+
+static const int kInititalPruneDifference = kFinalPruneDifference * 4;
+ // Initial bits of minimum
+ // probability difference 1st-nth
+ // to be pruned
+ //
+static const int kPruneDiffDecrement = kFinalPruneDifference;
+ // Decrements bits of minimum
+ // probability difference 1st-nth
+ // to be pruned
+
+static const int kSmallInitDiff = 2 * XLOG2; // bits of minimum
+ // probability difference, base to
+ // superset encodings
+
+static const int kBoostInitial = 20 * XLOG2; // bits of boost for
+ // initial byte patterns (BOM, 00)
+
+static const int kBadPairWhack = 20 * XLOG2; // bits of whack for
+ // one bad pair
+
+static const int kBoostOnePair = 20 * XLOG2; // bits of boost for
+ // one good pair in Hz, etc.
+
+static const int kGentleOnePair = 4 * XLOG2; // bits of boost for
+ // one good sequence
+ //
+static const int kGentlePairWhack = 2 * XLOG2; // bits of whack
+ // for ill-formed sequence
+
+static const int kGentlePairBoost = 2 * XLOG2; // bits of boost
+ // for well-formed sequence
+
+static const int kDeclaredEncBoost = 5 * XDECILOG2; // bits/10 of boost for
+ // best declared encoding per bigram
+
+static const int kBestEncBoost = 5 * XDECILOG2; // bits/10 of boost for
+ // best encoding per bigram
+
+static const int kTrigramBoost = 2 * XLOG2; // bits of boost for Latin127 tri
+
+static const int kMaxPairs = 48; // Max interesting pairs to look at
+ // If you change this,
+ // adjust *PruneDiff*
+
+static const int kPruneMask = 0x07; // Prune every 8 interesting pairs
+
+
+static const int kBestPairsCount = 16; // For first N pairs, do extra boost
+ // based on most likely encoding
+ // of pair over entire web
+
+static const int kDerateHintsBelow = 12; // If we have fewer than N bigrams,
+ // weaken the hints enough that
+ // unhinted encodings have a hope of
+ // rising to the top
+
+static const int kMinRescanLength = 800; // Don't bother rescanning for
+ // unreliable encoding if fewer
+ // than this many bytes unscanned.
+ // We will rescan at most last half
+ // of this.
+
+static const int kStrongBinary = 12; // Make F_BINARY the only encoding
+static const int kWeakerBinary = 4; // Make F_BINARY likely encoding
+
+// These are byte counts from front of file
+static const int kBinaryHardAsciiLimit = 6 * 1024; // Not binary if all ASCII
+static const int kBinarySoftAsciiLimit = 8 * 1024; // " if mostly ASCII
+
+// We try here to avoid having title text dominate the encoding detection,
+// for the not-infrequent error case of title in encoding1, body in encoding2:
+// we want to bias toward encoding2 winning.
+//
+// kMaxBigramsTagTitleText should be a multiple of 2, 3, and 4, so that we
+// rarely cut off mid-character in the original (not-yet-detected) encoding.
+// This matters most for UTF-8 two- and three-byte codes and for
+// Shift-JIS three-byte codes.
+static const int kMaxBigramsTagTitleText = 12; // Keep only some tag text
+static const int kWeightshiftForTagTitleText = 4; // Give text in tags, etc.
+ // 1/16 normal weight
+
+static const int kStrongPairs = 6; // Let reliable enc with this many
+ // pairs overcome missing hint
+
+enum CEDInternalFlags { // Type of the first argument of InternalDetectEncoding(); non-zero values are distinct bits
+ kCEDNone = 0, // The empty flag
+ kCEDRescanning = 1, // Do not further recurse
+ kCEDSlowscore = 2, // Do extra scoring
+ kCEDForceTags = 4, // Always examine text inside tags
+};
+
+// Forward declaration
+Encoding InternalDetectEncoding(
+ CEDInternalFlags flags, const char* text, int text_length,
+ const char* url_hint, const char* http_charset_hint,
+ const char* meta_charset_hint, const int encoding_hint,
+ const Language language_hint, // User interface lang
+ const CompactEncDet::TextCorpusType corpus_type,
+ bool ignore_7bit_mail_encodings, int* bytes_consumed, bool* is_reliable,
+ Encoding* second_best_enc);
+
+typedef struct {
+ const uint8* hires[4]; // Pointers to possible high-resolution bigram deltas
+ uint8 x_bar; // Average byte2 value
+ uint8 y_bar; // Average byte1 value
+ uint8 x_stddev; // Standard deviation of byte2 value
+ uint8 y_stddev; // Standard deviation of byte1 value
+ int so; // Scaling offset -- add to probabilities below
+ uint8 b1[256]; // Unigram probability for first byte of aligned bigram
+ uint8 b2[256]; // Unigram probability for second byte of aligned bigram
+ uint8 b12[256]; // Unigram probability for cross bytes of aligned bigram
+} UnigramEntry;
+
+//typedef struct {
+// uint8 b12[256*256]; // Bigram probability for aligned bigram
+//} FullBigramEntry;
+
+
+// Include all the postproc-generated tables here:
+// RankedEncoding
+// kMapToEncoding
+// unigram_table
+// kMostLikelyEncoding
+// kTLDHintProbs
+// kCharsetHintProbs
+// HintEntry, kMaxTldKey kMaxTldVector, etc.
+// =============================================================================
+
+#include "compact_enc_det_generated_tables.h"
+
+
+#define F_ASCII F_Latin1 // "ASCII" is a misnomer, so this code uses "Latin1"
+
+#define F_BINARY F_X_BINARYENC // We are mid-update for name change
+#define F_UTF8UTF8 F_X_UTF8UTF8 // We are mid-update for name change
+#define F_BIG5_CP950 F_BIG5 // We are mid-update for name change
+#define F_Unicode F_UTF_16LE // We are mid-update for name change
+// =============================================================================
+
+// 7-bit encodings have at least one "interesting" byte value < 0x80
+// (00 0E 1B + ~)
+// JIS 2022-cn 2022-kr hz utf7
+// Unicode UTF-16 UTF-32
+// 8-bit encodings have no interesting byte values < 0x80
+static const uint32 kSevenBitActive = 0x00000001; // needs <80 to detect
+static const uint32 kUTF7Active = 0x00000002; // <80 and +
+static const uint32 kHzActive = 0x00000004; // <80 and ~
+static const uint32 kIso2022Active = 0x00000008; // <80 and 1B 0E 0F
+static const uint32 kUTF8Active = 0x00000010;
+static const uint32 kUTF8UTF8Active = 0x00000020;
+static const uint32 kUTF1632Active = 0x00000040; // <80 and 00
+static const uint32 kBinaryActive = 0x00000080; // <80 and 00
+static const uint32 kTwobyteCode = 0x00000100; // Needs 8xxx
+static const uint32 kIsIndicCode = 0x00000200; //
+static const uint32 kHighAlphaCode = 0x00000400; // full alphabet in 8x-Fx
+static const uint32 kHighAccentCode = 0x00000800; // accents in 8x-Fx
+static const uint32 kEUCJPActive = 0x00001000; // Have to mess with phase
+
+
+// Debug only. not thread safe
+static int encdet_used = 0;
+static int rescore_used = 0;
+static int rescan_used = 0;
+static int robust_used = 0;
+static int looking_used = 0;
+static int doing_used = 0;
+
+
+// For debugging only -- about 256B/entry times about 500 = 128KB
*** OUTPUT TRUNCATED, 16187 LINES SKIPPED ***
More information about the Commits
mailing list