commit b879952: [Rework] Update snowball stemmer to 2.0 and remove all crap aside of UTF8
Vsevolod Stakhov
vsevolod at highsecure.ru
Tue Feb 25 09:56:09 UTC 2020
Author: Vsevolod Stakhov
Date: 2020-02-25 09:55:31 +0000
URL: https://github.com/rspamd/rspamd/commit/b87995255fa2ef0de97d509b8cd27860f014e90f (HEAD -> master)
[Rework] Update snowball stemmer to 2.0 and remove all crap aside of UTF8
---
contrib/snowball/CMakeLists.txt | 35 +-
contrib/snowball/GNUmakefile | 300 ---
contrib/snowball/NEWS | 407 ++++
contrib/snowball/algorithms/arabic.sbl | 561 +++++
contrib/snowball/algorithms/basque.sbl | 149 ++
contrib/snowball/algorithms/catalan.sbl | 202 ++
.../{danish/stem_ISO_8859_1.sbl => danish.sbl} | 14 +-
.../algorithms/danish/stem_MS_DOS_Latin_I.sbl | 91 -
.../{dutch/stem_ISO_8859_1.sbl => dutch.sbl} | 24 +-
.../algorithms/dutch/stem_MS_DOS_Latin_I.sbl | 164 --
.../{english/stem_ISO_8859_1.sbl => english.sbl} | 0
.../{finnish/stem_ISO_8859_1.sbl => finnish.sbl} | 15 +-
.../{french/stem_ISO_8859_1.sbl => french.sbl} | 42 +-
.../algorithms/french/stem_MS_DOS_Latin_I.sbl | 239 --
.../{german/stem_ISO_8859_1.sbl => german.sbl} | 10 +-
.../algorithms/german/stem_MS_DOS_Latin_I.sbl | 139 --
.../{german2/stem_ISO_8859_1.sbl => german2.sbl} | 10 +-
contrib/snowball/algorithms/greek.sbl | 706 ++++++
contrib/snowball/algorithms/hindi.sbl | 323 +++
.../{hungarian/stem_Unicode.sbl => hungarian.sbl} | 22 +-
.../algorithms/hungarian/stem_ISO_8859_2.sbl | 241 --
contrib/snowball/algorithms/indonesian.sbl | 192 ++
contrib/snowball/algorithms/irish.sbl | 151 ++
.../stem_MS_DOS_Latin_I.sbl => italian.sbl} | 24 +-
.../algorithms/italian/stem_ISO_8859_1.sbl | 195 --
.../stem_ISO_8859_1.sbl => kraaij_pohlmann.sbl} | 31 +-
contrib/snowball/algorithms/lithuanian.sbl | 373 +++
.../{lovins/stem_ISO_8859_1.sbl => lovins.sbl} | 0
contrib/snowball/algorithms/nepali.sbl | 92 +
.../stem_ISO_8859_1.sbl => norwegian.sbl} | 8 +-
.../algorithms/norwegian/stem_MS_DOS_Latin_I.sbl | 80 -
.../{porter/stem_ISO_8859_1.sbl => porter.sbl} | 0
.../stem_ISO_8859_1.sbl => portuguese.sbl} | 30 +-
.../algorithms/portuguese/stem_MS_DOS_Latin_I.sbl | 218 --
.../{romanian/stem_ISO_8859_2.sbl => romanian.sbl} | 10 +-
.../snowball/algorithms/romanian/stem_Unicode.sbl | 236 --
.../{russian/stem_KOI8_R.sbl => russian.sbl} | 76 +-
.../snowball/algorithms/russian/stem_Unicode.sbl | 215 --
contrib/snowball/algorithms/serbian.sbl | 2378 ++++++++++++++++++++
.../{spanish/stem_ISO_8859_1.sbl => spanish.sbl} | 18 +-
.../algorithms/spanish/stem_MS_DOS_Latin_I.sbl | 230 --
.../{swedish/stem_ISO_8859_1.sbl => swedish.sbl} | 8 +-
.../algorithms/swedish/stem_MS_DOS_Latin_I.sbl | 72 -
contrib/snowball/algorithms/tamil.sbl | 405 ++++
.../{turkish/stem_Unicode.sbl => turkish.sbl} | 201 +-
contrib/snowball/charsets/ISO-8859-2.sbl | 98 +
contrib/snowball/charsets/KOI8-R.sbl | 74 +
contrib/snowball/charsets/cp850.sbl | 130 ++
contrib/snowball/compiler/analyser.c | 866 +++++--
contrib/snowball/compiler/driver.c | 466 +++-
contrib/snowball/compiler/generator.c | 1158 ++++++----
contrib/snowball/compiler/generator_java.c | 1452 ------------
contrib/snowball/compiler/header.h | 215 +-
contrib/snowball/compiler/space.c | 72 +-
contrib/snowball/compiler/syswords.h | 6 +-
contrib/snowball/compiler/syswords2.h | 4 +-
contrib/snowball/compiler/tokeniser.c | 407 ++--
contrib/snowball/examples/stemwords.c | 209 --
contrib/snowball/include/libstemmer.h | 9 +-
.../snowball/java/org/tartarus/snowball/Among.java | 31 -
.../org/tartarus/snowball/SnowballProgram.java | 432 ----
.../org/tartarus/snowball/SnowballStemmer.java | 7 -
.../java/org/tartarus/snowball/TestApp.java | 77 -
contrib/snowball/libstemmer/libstemmer_c.in | 15 +-
contrib/snowball/libstemmer/mkmodules.pl | 25 +-
contrib/snowball/libstemmer/modules.txt | 44 +-
contrib/snowball/runtime/api.c | 10 +-
contrib/snowball/runtime/api.h | 10 +-
contrib/snowball/runtime/header.h | 3 +-
contrib/snowball/runtime/utilities.c | 191 +-
70 files changed, 8903 insertions(+), 6045 deletions(-)
diff --git a/contrib/snowball/CMakeLists.txt b/contrib/snowball/CMakeLists.txt
index 7910c7b3a..015f75d1b 100644
--- a/contrib/snowball/CMakeLists.txt
+++ b/contrib/snowball/CMakeLists.txt
@@ -1,20 +1,14 @@
# End of configuration
-SET(LIBSTEM_ALGORITHMS danish dutch english finnish french german hungarian
- italian norwegian porter portuguese romanian
- russian spanish swedish turkish)
-SET(KOI8_ALGORITHMS russian)
-SET(ISO_8859_1_ALGORITHMS danish dutch english finnish french german italian
- norwegian porter portuguese spanish swedish)
-SET(ISO_8859_2_ALGORITHMS hungarian romanian)
-SET(OTHER_ALGORITHMS german2 kraaij_pohlmann lovins)
-SET(ALL_ALGORITHMS ${LIBSTEM_ALGORITHMS} ${OTHER_ALGORITHMS})
+SET(LIBSTEM_ALGORITHMS arabic danish dutch english finnish french german greek hindi hungarian
+ indonesian italian lithuanian nepali norwegian porter portuguese romanian
+ russian serbian spanish swedish tamil turkish)
+SET(ALL_ALGORITHMS ${LIBSTEM_ALGORITHMS})
SET(COMPILER_SOURCES compiler/space.c
compiler/tokeniser.c
compiler/analyser.c
compiler/generator.c
- compiler/driver.c
- compiler/generator_java.c)
+ compiler/driver.c)
SET(SNOWBALL_RUNTIME runtime/api.c
runtime/utilities.c)
@@ -24,9 +18,15 @@ SET(LIBSTEMMER_UTF8_SOURCES libstemmer/libstemmer_utf8.c)
#LIBSTEMMER_HEADERS = include/libstemmer.h libstemmer/modules.h libstemmer/modules_utf8.h
#LIBSTEMMER_EXTRA = libstemmer/modules.txt libstemmer/modules_utf8.txt libstemmer/libstemmer_c.in
-SET(STEMWORDS_SOURCES examples/stemwords.c)
SET(MODULES_H "modules.h")
CONFIGURE_FILE(${CMAKE_CURRENT_SOURCE_DIR}/libstemmer/libstemmer_c.in ${CMAKE_CURRENT_BINARY_DIR}/libstemmer/libstemmer.c @ONLY)
+ADD_DEFINITIONS("-DDISABLE_JS")
+ADD_DEFINITIONS("-DDISABLE_GO")
+ADD_DEFINITIONS("-DDISABLE_JAVA")
+ADD_DEFINITIONS("-DDISABLE_PYTHON")
+ADD_DEFINITIONS("-DDISABLE_CSHARP")
+ADD_DEFINITIONS("-DDISABLE_PASCAL")
+ADD_DEFINITIONS("-DDISABLE_RUST")
MACRO(gen_stem IN ENCODING)
FOREACH(_it ${IN})
@@ -34,7 +34,7 @@ MACRO(gen_stem IN ENCODING)
SET(_header "${_base}.h")
SET(_source "${_base}.c")
STRING(REPLACE "UTF_8" "Unicode" _in_enc "${ENCODING}")
- SET(_input "${CMAKE_CURRENT_SOURCE_DIR}/algorithms/${_it}/stem_${_in_enc}.sbl")
+ SET(_input "${CMAKE_CURRENT_SOURCE_DIR}/algorithms/${_it}.sbl")
IF(${_in_enc} STREQUAL "Unicode" AND NOT EXISTS ${_input})
ADD_CUSTOM_COMMAND(OUTPUT ${_source}
COMMAND ${CMAKE_CURRENT_BINARY_DIR}/snowball "${CMAKE_CURRENT_SOURCE_DIR}/algorithms/${_it}/stem_ISO_8859_1.sbl" -o ${_base} -eprefix ${_it}_${ENCODING}_ -r ../runtime -u
@@ -57,7 +57,7 @@ INCLUDE_DIRECTORIES("include")
ADD_EXECUTABLE(snowball ${COMPILER_SOURCES})
ADD_CUSTOM_COMMAND(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/libstemmer/modules.h
- COMMAND ${PERL_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/libstemmer/mkmodules.pl libstemmer/modules.h libstemmer ${CMAKE_CURRENT_SOURCE_DIR}/libstemmer/modules.txt libstemmer/mkinc.mak)
+ COMMAND ${PERL_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/libstemmer/mkmodules.pl ${CMAKE_CURRENT_BINARY_DIR}/libstemmer/modules.h libstemmer ${CMAKE_CURRENT_SOURCE_DIR}/libstemmer/modules.txt ${CMAKE_CURRENT_BINARY_DIR}/libstemmer/mkinc.mak)
ADD_CUSTOM_TARGET(modules DEPENDS "${CMAKE_CURRENT_BINARY_DIR}/libstemmer/modules.h")
SET(STEMMER_SOURCES "${CMAKE_CURRENT_BINARY_DIR}/libstemmer/libstemmer.c")
@@ -65,13 +65,6 @@ ADD_CUSTOM_TARGET(stemmer_deps ALL)
ADD_DEPENDENCIES(stemmer_deps modules)
gen_stem("${LIBSTEM_ALGORITHMS}" "UTF_8")
-gen_stem("${KOI8_ALGORITHMS}" "KOI8_R")
-gen_stem("${ISO_8859_1_ALGORITHMS}" "ISO_8859_1")
-gen_stem("${ISO_8859_2_ALGORITHMS}" "ISO_8859_2")
-
ADD_LIBRARY(stemmer ${LINK_TYPE} ${SNOWBALL_RUNTIME} ${STEMMER_SOURCES})
ADD_DEPENDENCIES(stemmer stemmer_deps)
-
-ADD_EXECUTABLE(stemwords ${STEMWORDS_SOURCES})
-TARGET_LINK_LIBRARIES(stemwords stemmer)
diff --git a/contrib/snowball/GNUmakefile b/contrib/snowball/GNUmakefile
deleted file mode 100644
index a30cafd89..000000000
--- a/contrib/snowball/GNUmakefile
+++ /dev/null
@@ -1,300 +0,0 @@
-# -*- makefile -*-
-
-c_src_dir = src_c
-java_src_main_dir = java/org/tartarus/snowball
-java_src_dir = $(java_src_main_dir)/ext
-
-libstemmer_algorithms = danish dutch english finnish french german hungarian \
- italian \
- norwegian porter portuguese romanian \
- russian spanish swedish turkish
-
-KOI8_R_algorithms = russian
-ISO_8859_1_algorithms = danish dutch english finnish french german italian \
- norwegian porter portuguese spanish swedish
-ISO_8859_2_algorithms = hungarian romanian
-
-other_algorithms = german2 kraaij_pohlmann lovins
-
-all_algorithms = $(libstemmer_algorithms) $(other_algorithms)
-
-COMPILER_SOURCES = compiler/space.c \
- compiler/tokeniser.c \
- compiler/analyser.c \
- compiler/generator.c \
- compiler/driver.c \
- compiler/generator_java.c
-COMPILER_HEADERS = compiler/header.h \
- compiler/syswords.h \
- compiler/syswords2.h
-
-RUNTIME_SOURCES = runtime/api.c \
- runtime/utilities.c
-RUNTIME_HEADERS = runtime/api.h \
- runtime/header.h
-
-JAVARUNTIME_SOURCES = java/org/tartarus/snowball/Among.java \
- java/org/tartarus/snowball/SnowballProgram.java \
- java/org/tartarus/snowball/SnowballStemmer.java \
- java/org/tartarus/snowball/TestApp.java
-
-LIBSTEMMER_SOURCES = libstemmer/libstemmer.c
-LIBSTEMMER_UTF8_SOURCES = libstemmer/libstemmer_utf8.c
-LIBSTEMMER_HEADERS = include/libstemmer.h libstemmer/modules.h libstemmer/modules_utf8.h
-LIBSTEMMER_EXTRA = libstemmer/modules.txt libstemmer/modules_utf8.txt libstemmer/libstemmer_c.in
-
-STEMWORDS_SOURCES = examples/stemwords.c
-
-ALL_ALGORITHM_FILES = $(all_algorithms:%=algorithms/%/stem*.sbl)
-C_LIB_SOURCES = $(libstemmer_algorithms:%=$(c_src_dir)/stem_UTF_8_%.c) \
- $(KOI8_R_algorithms:%=$(c_src_dir)/stem_KOI8_R_%.c) \
- $(ISO_8859_1_algorithms:%=$(c_src_dir)/stem_ISO_8859_1_%.c) \
- $(ISO_8859_2_algorithms:%=$(c_src_dir)/stem_ISO_8859_2_%.c)
-C_LIB_HEADERS = $(libstemmer_algorithms:%=$(c_src_dir)/stem_UTF_8_%.h) \
- $(KOI8_R_algorithms:%=$(c_src_dir)/stem_KOI8_R_%.h) \
- $(ISO_8859_1_algorithms:%=$(c_src_dir)/stem_ISO_8859_1_%.h) \
- $(ISO_8859_2_algorithms:%=$(c_src_dir)/stem_ISO_8859_2_%.h)
-C_OTHER_SOURCES = $(other_algorithms:%=$(c_src_dir)/stem_UTF_8_%.c)
-C_OTHER_HEADERS = $(other_algorithms:%=$(c_src_dir)/stem_UTF_8_%.h)
-JAVA_SOURCES = $(libstemmer_algorithms:%=$(java_src_dir)/%Stemmer.java)
-
-COMPILER_OBJECTS=$(COMPILER_SOURCES:.c=.o)
-RUNTIME_OBJECTS=$(RUNTIME_SOURCES:.c=.o)
-LIBSTEMMER_OBJECTS=$(LIBSTEMMER_SOURCES:.c=.o)
-LIBSTEMMER_UTF8_OBJECTS=$(LIBSTEMMER_UTF8_SOURCES:.c=.o)
-STEMWORDS_OBJECTS=$(STEMWORDS_SOURCES:.c=.o)
-C_LIB_OBJECTS = $(C_LIB_SOURCES:.c=.o)
-C_OTHER_OBJECTS = $(C_OTHER_SOURCES:.c=.o)
-JAVA_CLASSES = $(JAVA_SOURCES:.java=.class)
-JAVA_RUNTIME_CLASSES=$(JAVARUNTIME_SOURCES:.java=.class)
-
-CFLAGS=-Iinclude -O2
-CPPFLAGS=-W -Wall -Wmissing-prototypes -Wmissing-declarations
-
-all: snowball libstemmer.o stemwords $(C_OTHER_SOURCES) $(C_OTHER_HEADERS) $(C_OTHER_OBJECTS)
-
-clean:
- rm -f $(COMPILER_OBJECTS) $(RUNTIME_OBJECTS) \
- $(LIBSTEMMER_OBJECTS) $(LIBSTEMMER_UTF8_OBJECTS) $(STEMWORDS_OBJECTS) snowball \
- libstemmer.o stemwords \
- libstemmer/modules.h \
- libstemmer/modules_utf8.h \
- snowball.splint \
- $(C_LIB_SOURCES) $(C_LIB_HEADERS) $(C_LIB_OBJECTS) \
- $(C_OTHER_SOURCES) $(C_OTHER_HEADERS) $(C_OTHER_OBJECTS) \
- $(JAVA_SOURCES) $(JAVA_CLASSES) $(JAVA_RUNTIME_CLASSES) \
- libstemmer/mkinc.mak libstemmer/mkinc_utf8.mak \
- libstemmer/libstemmer.c libstemmer/libstemmer_utf8.c
- rm -rf dist
- rmdir $(c_src_dir) || true
-
-snowball: $(COMPILER_OBJECTS)
- $(CC) -o $@ $^
-
-$(COMPILER_OBJECTS): $(COMPILER_HEADERS)
-
-libstemmer/libstemmer.c: libstemmer/libstemmer_c.in
- sed 's/@MODULES_H@/modules.h/' $^ >$@
-
-libstemmer/libstemmer_utf8.c: libstemmer/libstemmer_c.in
- sed 's/@MODULES_H@/modules_utf8.h/' $^ >$@
-
-libstemmer/modules.h libstemmer/mkinc.mak: libstemmer/mkmodules.pl libstemmer/modules.txt
- libstemmer/mkmodules.pl $@ $(c_src_dir) libstemmer/modules.txt libstemmer/mkinc.mak
-
-libstemmer/modules_utf8.h libstemmer/mkinc_utf8.mak: libstemmer/mkmodules.pl libstemmer/modules_utf8.txt
- libstemmer/mkmodules.pl $@ $(c_src_dir) libstemmer/modules_utf8.txt libstemmer/mkinc_utf8.mak utf8
-
-libstemmer/libstemmer.o: libstemmer/modules.h $(C_LIB_HEADERS)
-
-libstemmer.o: libstemmer/libstemmer.o $(RUNTIME_OBJECTS) $(C_LIB_OBJECTS)
- $(AR) -cru $@ $^
-
-stemwords: $(STEMWORDS_OBJECTS) libstemmer.o
- $(CC) -o $@ $^
-
-algorithms/%/stem_Unicode.sbl: algorithms/%/stem_ISO_8859_1.sbl
- cp $^ $@
-
-$(c_src_dir)/stem_UTF_8_%.c $(c_src_dir)/stem_UTF_8_%.h: algorithms/%/stem_Unicode.sbl snowball
- @mkdir -p $(c_src_dir)
- @l=`echo "$<" | sed 's!\(.*\)/stem_Unicode.sbl$$!\1!;s!^.*/!!'`; \
- o="$(c_src_dir)/stem_UTF_8_$${l}"; \
- echo "./snowball $< -o $${o} -eprefix $${l}_UTF_8_ -r ../runtime -u"; \
- ./snowball $< -o $${o} -eprefix $${l}_UTF_8_ -r ../runtime -u
-
-$(c_src_dir)/stem_KOI8_R_%.c $(c_src_dir)/stem_KOI8_R_%.h: algorithms/%/stem_KOI8_R.sbl snowball
- @mkdir -p $(c_src_dir)
- @l=`echo "$<" | sed 's!\(.*\)/stem_KOI8_R.sbl$$!\1!;s!^.*/!!'`; \
- o="$(c_src_dir)/stem_KOI8_R_$${l}"; \
- echo "./snowball $< -o $${o} -eprefix $${l}_KOI8_R_ -r ../runtime"; \
- ./snowball $< -o $${o} -eprefix $${l}_KOI8_R_ -r ../runtime
-
-$(c_src_dir)/stem_ISO_8859_1_%.c $(c_src_dir)/stem_ISO_8859_1_%.h: algorithms/%/stem_ISO_8859_1.sbl snowball
- @mkdir -p $(c_src_dir)
- @l=`echo "$<" | sed 's!\(.*\)/stem_ISO_8859_1.sbl$$!\1!;s!^.*/!!'`; \
- o="$(c_src_dir)/stem_ISO_8859_1_$${l}"; \
- echo "./snowball $< -o $${o} -eprefix $${l}_ISO_8859_1_ -r ../runtime"; \
- ./snowball $< -o $${o} -eprefix $${l}_ISO_8859_1_ -r ../runtime
-
-$(c_src_dir)/stem_ISO_8859_2_%.c $(c_src_dir)/stem_ISO_8859_2_%.h: algorithms/%/stem_ISO_8859_2.sbl snowball
- @mkdir -p $(c_src_dir)
- @l=`echo "$<" | sed 's!\(.*\)/stem_ISO_8859_2.sbl$$!\1!;s!^.*/!!'`; \
- o="$(c_src_dir)/stem_ISO_8859_2_$${l}"; \
- echo "./snowball $< -o $${o} -eprefix $${l}_ISO_8859_2_ -r ../runtime"; \
- ./snowball $< -o $${o} -eprefix $${l}_ISO_8859_2_ -r ../runtime
-
-$(c_src_dir)/stem_%.o: $(c_src_dir)/stem_%.c $(c_src_dir)/stem_%.h
- $(CC) $(CFLAGS) $(CPPFLAGS) -c -o $@ $<
-
-$(java_src_dir)/%Stemmer.java: algorithms/%/stem_Unicode.sbl snowball
- @mkdir -p $(java_src_dir)
- @l=`echo "$<" | sed 's!\(.*\)/stem_Unicode.sbl$$!\1!;s!^.*/!!'`; \
- o="$(java_src_dir)/$${l}Stemmer"; \
- echo "./snowball $< -j -o $${o} -p \"org.tartarus.snowball.SnowballStemmer\" -eprefix $${l}_ -r ../runtime -n $${l}Stemmer"; \
- ./snowball $< -j -o $${o} -p "org.tartarus.snowball.SnowballStemmer" -eprefix $${l}_ -r ../runtime -n $${l}Stemmer
-
-splint: snowball.splint
-snowball.splint: $(COMPILER_SOURCES)
- splint $^ >$@ -weak
-
-# Make a full source distribution
-dist: dist_snowball dist_libstemmer_c dist_libstemmer_java
-
-# Make a distribution of all the sources involved in snowball
-dist_snowball: $(COMPILER_SOURCES) $(COMPILER_HEADERS) \
- $(RUNTIME_SOURCES) $(RUNTIME_HEADERS) \
- $(LIBSTEMMER_SOURCES) \
- $(LIBSTEMMER_UTF8_SOURCES) \
- $(LIBSTEMMER_HEADERS) \
- $(LIBSTEMMER_EXTRA) \
- $(ALL_ALGORITHM_FILES) $(STEMWORDS_SOURCES) \
- GNUmakefile README doc/TODO libstemmer/mkmodules.pl
- destname=snowball_code; \
- dest=dist/$${destname}; \
- rm -rf $${dest} && \
- rm -f $${dest}.tgz && \
- for file in $^; do \
- dir=`dirname $$file` && \
- mkdir -p $${dest}/$${dir} && \
- cp -a $${file} $${dest}/$${dir} || exit 1 ; \
- done && \
- (cd dist && tar zcf $${destname}.tgz $${destname}) && \
- rm -rf $${dest}
-
-# Make a distribution of all the sources required to compile the C library.
-dist_libstemmer_c: \
- $(RUNTIME_SOURCES) \
- $(RUNTIME_HEADERS) \
- $(LIBSTEMMER_SOURCES) \
- $(LIBSTEMMER_UTF8_SOURCES) \
- $(LIBSTEMMER_HEADERS) \
- $(LIBSTEMMER_EXTRA) \
- $(C_LIB_SOURCES) \
- $(C_LIB_HEADERS) \
- libstemmer/mkinc.mak \
- libstemmer/mkinc_utf8.mak
- destname=libstemmer_c; \
- dest=dist/$${destname}; \
- rm -rf $${dest} && \
- rm -f $${dest}.tgz && \
- mkdir -p $${dest} && \
- cp -a doc/libstemmer_c_README $${dest}/README && \
- mkdir -p $${dest}/examples && \
- cp -a examples/stemwords.c $${dest}/examples && \
- mkdir -p $${dest}/$(c_src_dir) && \
- cp -a $(C_LIB_SOURCES) $(C_LIB_HEADERS) $${dest}/$(c_src_dir) && \
- mkdir -p $${dest}/runtime && \
- cp -a $(RUNTIME_SOURCES) $(RUNTIME_HEADERS) $${dest}/runtime && \
- mkdir -p $${dest}/libstemmer && \
- cp -a $(LIBSTEMMER_SOURCES) $(LIBSTEMMER_UTF8_SOURCES) $(LIBSTEMMER_HEADERS) $(LIBSTEMMER_EXTRA) $${dest}/libstemmer && \
- mkdir -p $${dest}/include && \
- mv $${dest}/libstemmer/libstemmer.h $${dest}/include && \
- (cd $${dest} && \
- echo "README" >> MANIFEST && \
- ls $(c_src_dir)/*.c $(c_src_dir)/*.h >> MANIFEST && \
- ls runtime/*.c runtime/*.h >> MANIFEST && \
- ls libstemmer/*.c libstemmer/*.h >> MANIFEST && \
- ls include/*.h >> MANIFEST) && \
- cp -a libstemmer/mkinc.mak libstemmer/mkinc_utf8.mak $${dest}/ && \
- echo 'include mkinc.mak' >> $${dest}/Makefile && \
- echo 'CFLAGS=-Iinclude' >> $${dest}/Makefile && \
- echo 'all: libstemmer.o stemwords' >> $${dest}/Makefile && \
- echo 'libstemmer.o: $$(snowball_sources:.c=.o)' >> $${dest}/Makefile && \
- echo ' $$(AR) -cru $$@ $$^' >> $${dest}/Makefile && \
- echo 'stemwords: examples/stemwords.o libstemmer.o' >> $${dest}/Makefile && \
- echo ' $$(CC) -o $$@ $$^' >> $${dest}/Makefile && \
- echo 'clean:' >> $${dest}/Makefile && \
- echo ' rm -f stemwords *.o $(c_src_dir)/*.o runtime/*.o libstemmer/*.o' >> $${dest}/Makefile && \
- (cd dist && tar zcf $${destname}.tgz $${destname}) && \
- rm -rf $${dest}
-
-# Make a distribution of all the sources required to compile the Java library.
-dist_libstemmer_java: $(RUNTIME_SOURCES) $(RUNTIME_HEADERS) \
- $(LIBSTEMMER_EXTRA) \
- $(JAVA_SOURCES)
- destname=libstemmer_java; \
- dest=dist/$${destname}; \
- rm -rf $${dest} && \
- rm -f $${dest}.tgz && \
- mkdir -p $${dest} && \
- cp -a doc/libstemmer_java_README $${dest}/README && \
- mkdir -p $${dest}/$(java_src_dir) && \
- cp -a $(JAVA_SOURCES) $${dest}/$(java_src_dir) && \
- mkdir -p $${dest}/$(java_src_main_dir) && \
- cp -a $(JAVARUNTIME_SOURCES) $${dest}/$(java_src_main_dir) && \
- (cd $${dest} && \
- echo "README" >> MANIFEST && \
- ls $(java_src_dir)/*.java >> MANIFEST && \
- ls $(java_src_main_dir)/*.java >> MANIFEST) && \
- (cd dist && tar zcf $${destname}.tgz $${destname}) && \
- rm -rf $${dest}
-
-check: check_utf8 check_iso_8859_1 check_iso_8859_2 check_koi8r
-
-check_utf8: $(libstemmer_algorithms:%=check_utf8_%)
-
-check_iso_8859_1: $(ISO_8859_1_algorithms:%=check_iso_8859_1_%)
-
-check_iso_8859_2: $(ISO_8859_2_algorithms:%=check_iso_8859_2_%)
-
-check_koi8r: $(KOI8_R_algorithms:%=check_koi8r_%)
-
-# Where the data files are located - assumed their repo is checked out as
-# a sibling to this one.
-STEMMING_DATA = ../snowball-data
-
-check_utf8_%: $(STEMMING_DATA)/% stemwords
- @echo "Checking output of `echo $<|sed 's!.*/!!'` stemmer with UTF-8"
- @./stemwords -c UTF_8 -l `echo $<|sed 's!.*/!!'` -i $</voc.txt -o tmp.txt
- @diff -u $</output.txt tmp.txt
- @if [ -e $</diffs.txt ] ; \
- then \
- ./stemwords -c UTF_8 -l `echo $<|sed 's!.*/!!'` -i $</voc.txt -o tmp.txt -p2 && \
- diff -u $</diffs.txt tmp.txt; \
- fi
- @rm tmp.txt
-
-check_iso_8859_1_%: $(STEMMING_DATA)/% stemwords
- @echo "Checking output of `echo $<|sed 's!.*/!!'` stemmer with ISO_8859_1"
- @python -c 'print(open("$</voc.txt").read().decode("utf8").encode("iso8859-1"))' | \
- ./stemwords -c ISO_8859_1 -l `echo $<|sed 's!.*/!!'` -o tmp.txt
- @python -c 'print(open("$</output.txt").read().decode("utf8").encode("iso8859-1"))' | \
- diff -u - tmp.txt
- @rm tmp.txt
-
-check_iso_8859_2_%: $(STEMMING_DATA)/% stemwords
- @echo "Checking output of `echo $<|sed 's!.*/!!'` stemmer with ISO_8859_2"
- @python -c 'print(open("$</voc.txt").read().decode("utf8").encode("iso8859-2"))' | \
- ./stemwords -c ISO_8859_2 -l `echo $<|sed 's!.*/!!'` -o tmp.txt
- @python -c 'print(open("$</output.txt").read().decode("utf8").encode("iso8859-2"))' | \
- diff -u - tmp.txt
- @rm tmp.txt
-
-check_koi8r_%: $(STEMMING_DATA)/% stemwords
- @echo "Checking output of `echo $<|sed 's!.*/!!'` stemmer with KOI8R"
- @python -c 'print(open("$</voc.txt").read().decode("utf8").encode("koi8_r"))' | \
- ./stemwords -c KOI8_R -l `echo $<|sed 's!.*/!!'` -o tmp.txt
- @python -c 'print(open("$</output.txt").read().decode("utf8").encode("koi8_r"))' | \
- diff -u - tmp.txt
- @rm tmp.txt
diff --git a/contrib/snowball/NEWS b/contrib/snowball/NEWS
new file mode 100644
index 000000000..c71c12dd3
--- /dev/null
+++ b/contrib/snowball/NEWS
@@ -0,0 +1,407 @@
+Snowball 2.0.0 (2019-10-02)
+===========================
+
+C/C++
+-----
+
+* Fully handle 4-byte UTF-8 sequences. Previously `hop` and `next` handled
+ sequences of any length, but commands which look at the character value only
+ handled sequences up to length 3. Fixes #89.
+
+* Fix handling of a 3-byte UTF-8 sequence in a grouping in `backwardmode`.
+
+Java
+----
+
+* TestApp.java:
+
+ - Always use UTF-8 for I/O. Patch from David Corbett (#80).
+
+ - Allow reading input from stdin.
+
+ - Remove rather pointless "stem n times" feature.
+
+ - Only lower case ASCII to match stemwords.c.
+
+ - Stem empty lines too to match stemwords.c.
+
+Code Quality Improvements
+-------------------------
+
+* Fix various warnings from newer compilers.
+
+* Improve use of `const`.
+
+* Share common functions between compiler backends rather than having multiple
+ copies of the same code.
+
+* Assorted code clean-up.
+
+* Initialise line_labelled member of struct generator to 0. Previously we were
+ invoking undefined behaviour, though in practice it'll be zero initialised on
+ most platforms.
+
+New Code Generators
+-------------------
+
+* Add Python generator (#24). Originally written by Yoshiki Shibukawa, with
+ additional updates by Dmitry Shachnev.
+
+* Add Javascript generator. Based on JSX generator (#26) written by Yoshiki
+ Shibukawa.
+
+* Add Rust generator from Jakob Demler (#51).
+
+* Add Go generator from Marty Schoch (#57).
+
+* Add C# generator. Based on patch from Cesar Souza (#16, #17).
+
+* Add Pascal generator. Based on Delphi backend from stemming.zip file on old
+ website (#75).
+
+New Language Features
+---------------------
+
+* Add `len` and `lenof` to measure Unicode length. These are similar to `size`
+ and `sizeof` (respectively), but `size` and `sizeof` return the length in
+ bytes under `-utf8`, whereas these new commands give the same result whether
+ using `-utf8`, `-widechars` or neither (but under `-utf8` they are O(n) in
+ the length of the string). For compatibility with existing code which might
+ use these as variable or function names, they stop being treated as tokens if
+ declared to be a variable or function.
+
+* New `{U+1234}` stringdef notation for Unicode codepoints.
+
+* More versatile integer tests. Now you can compare any two arithmetic
+ expressions with a relational operator in parentheses after the `$`, so for
+ example `$(len > 3)` can now be used when previously a temporary variable was
+ required: `$tmp = len $tmp > 3`
+
+Code generation improvements
+----------------------------
+
+* General:
+
+ + Avoid unnecessary saving and restoring of the cursor for more commands -
+ `atlimit`, `do`, `set` and `unset` all leave the cursor alone or always
+ restore its value, and for C `booltest` (which other languages already
+ handled).
+
+ + Special case handling for `setlimit tomark AE`. All uses of setlimit in
+ the current stemmers we ship follow this pattern, and by special-casing we
+ can avoid having to save and restore the cursor (#74).
+
+ + Merge duplicate actions in the same `among`. This reduces the size of the
+ switch/if-chain in the generated code which dispatch the among for many of
+ the stemmers.
+
+ + Generate simpler code for `among`. We always check for a zero return value
+ when we call the among, so there's no point also checking for that in the
+ switch/if-chain. We can also avoid the switch/if-chain entirely when
+ there's only one possible outcome (besides the zero return).
+
+ + Optimise code generated for `do <function call>`. This speeds up "make
+ check_python" by about 2%, and should speed up other interpreted languages
+ too (#110).
+
+ + Generate more and better comments referencing snowball source.
+
+ + Add homepage URL and compiler version as comments in generated files.
+
+* C/C++:
+
+ + Fix `size` and `sizeof` to not report one too high (reported by Assem
+ Chelli in #32).
+
+ + If signal `f` from a function call would lead to return from the current
+ function then handle this together with bailing out on an error, using a
+ simple `if (ret <= 0) return ret;`
+
+ + Inline testing for single character literals.
+
+ + Avoid generating `|| 0` in a corner case - this can result in a compiler
+ warning when building the generated code.
+
+ + Implement `insert_v()` in terms of `insert_s()`.
+
+ + Add conditional `extern "C"` so `runtime/api.h` can be included from C++
+ code. Closes #90, reported by vvarma.
+
+* Java:
+
+ + Fix functions in `among` to work in Java. We seem to need to make the
+ methods called from among `public` instead of `private`, and to call them
+ on `this` instead of the `methodObject` (which is cleaner anyway). No
+ revision in version control seems to generate working code for this case,
+ but Richard says it definitely used to work - possibly older JVMs failed to
+ correctly enforce the access controls when methods were invoked by
+ reflection.
+
+ + Code after handling `f` by returning from the current function is
+ unreachable too.
+
+ + Previously we incorrectly decided that code after an `or` was
+ unreachable in certain cases. None of the current stemmers in the
+ distribution triggered this, but Martin Porter's snowball version
+ of the Schinke Latin stemmer does. Fixes #58, reported by Alexander
+ Myltsev.
+
+ + The reachability logic was failing to consider reachability from
+ the final command in an `or`. Fixes #82, reported by David Corbett.
+
+ + Fix `maxint` and `minint`. Patch from David Corbett in #31.
+
+ + Fix `$` on strings. The previous generated code was just wrong. This
+ doesn't affect any of the included algorithms, but for example breaks
+ Martin Porter's snowball implementation of Schinke's Latin Stemmer.
+ Issue noted by Jakob Demler while working on the Rust backend in #51,
+ and reported in the Schinke's Latin Stemmer by Alexander Myltsev
+ in #58.
+
+ + Make SnowballProgram objects serializable. Patch from Oleg Smirnov in #43.
+
+ + Eliminate range-check implementation for groupings. This was removed from
+ the C generator 10 years earlier, isn't used for any of the existing
+ algorithms, and it doesn't seem likely it would be - the grouping would
+ have to consist entirely of a contiguous block of Unicode code-points.
+
+ + Simplify code generated for `repeat` and `atleast`.
+
+ + Eliminate unused return values and variables from runtime functions.
+
+ + Only import the `among` and `SnowballProgram` classes if they're actually
+ used.
+
+ + Only generate `copy_from()` method if it's used.
+
+ + Merge runtime functions `eq_s` and `eq_v`.
+
+ + Java arrays know their own length so stop storing it separately.
+
+ + Escape char 127 (DEL) in generated Java code. It's unlikely that this
+ character would actually be used in a real stemmer, so this was more of a
+ theoretical bug.
+
+ + Drop unused import of InvocationTargetException from SnowballStemmer.
+ Reported by GerritDeMeulder in #72.
+
+ + Fix lint check issues in generated Java code. The stemmer classes are only
+ referenced in the example app via reflection, so add
+ @SuppressWarnings("unused") for them. The stemmer classes override
+ equals() and hashCode() methods from the standard java Object class, so
+ mark these with @Override. Both suggested by GerritDeMeulder in #72.
+
+ + Declare Java variables at point of use in generated code. Putting all
+ declarations at the top of the function was adding unnecessary complexity
+ to the Java generator code for no benefit.
+
+ + Improve formatting of generated code.
+
+New stemming algorithms
+-----------------------
+
+* Add Tamil stemmer from Damodharan Rajalingam (#2, #3).
+
+* Add Arabic stemmer from Assem Chelli (#32, #50).
+
+* Add Irish stemmer from Jim O'Regan (#48).
+
+* Add Nepali stemmer from Arthur Zakirov (#70).
+
+* Add Indonesian stemmer from Olly Betts (#71).
+
+* Add Hindi stemmer from Olly Betts (#73). Thanks to David Corbett for review.
+
+* Add Lithuanian stemmer from Dainius Jocas (#22, #76).
+
+* Add Greek stemmer from Oleg Smirnov (#44).
+
+* Add Catalan and Basque stemmers from Israel Olalla (#104).
+
+Behavioural changes to existing algorithms
+------------------------------------------
+
+* Portuguese:
+
+ + Replace incorrect Spanish suffixes by Portuguese suffixes (#1).
+
+* French:
+
+ + The MSDOS CP850 version of the French algorithm was missing changes present
*** OUTPUT TRUNCATED, 17538 LINES SKIPPED ***
More information about the Commits
mailing list