commit b879952: [Rework] Update snowball stemmer to 2.0 and remove all crap aside of UTF8

Vsevolod Stakhov vsevolod at highsecure.ru
Tue Feb 25 09:56:09 UTC 2020


Author: Vsevolod Stakhov
Date: 2020-02-25 09:55:31 +0000
URL: https://github.com/rspamd/rspamd/commit/b87995255fa2ef0de97d509b8cd27860f014e90f (HEAD -> master)

[Rework] Update snowball stemmer to 2.0 and remove all crap aside of UTF8

---
 contrib/snowball/CMakeLists.txt                    |   35 +-
 contrib/snowball/GNUmakefile                       |  300 ---
 contrib/snowball/NEWS                              |  407 ++++
 contrib/snowball/algorithms/arabic.sbl             |  561 +++++
 contrib/snowball/algorithms/basque.sbl             |  149 ++
 contrib/snowball/algorithms/catalan.sbl            |  202 ++
 .../{danish/stem_ISO_8859_1.sbl => danish.sbl}     |   14 +-
 .../algorithms/danish/stem_MS_DOS_Latin_I.sbl      |   91 -
 .../{dutch/stem_ISO_8859_1.sbl => dutch.sbl}       |   24 +-
 .../algorithms/dutch/stem_MS_DOS_Latin_I.sbl       |  164 --
 .../{english/stem_ISO_8859_1.sbl => english.sbl}   |    0
 .../{finnish/stem_ISO_8859_1.sbl => finnish.sbl}   |   15 +-
 .../{french/stem_ISO_8859_1.sbl => french.sbl}     |   42 +-
 .../algorithms/french/stem_MS_DOS_Latin_I.sbl      |  239 --
 .../{german/stem_ISO_8859_1.sbl => german.sbl}     |   10 +-
 .../algorithms/german/stem_MS_DOS_Latin_I.sbl      |  139 --
 .../{german2/stem_ISO_8859_1.sbl => german2.sbl}   |   10 +-
 contrib/snowball/algorithms/greek.sbl              |  706 ++++++
 contrib/snowball/algorithms/hindi.sbl              |  323 +++
 .../{hungarian/stem_Unicode.sbl => hungarian.sbl}  |   22 +-
 .../algorithms/hungarian/stem_ISO_8859_2.sbl       |  241 --
 contrib/snowball/algorithms/indonesian.sbl         |  192 ++
 contrib/snowball/algorithms/irish.sbl              |  151 ++
 .../stem_MS_DOS_Latin_I.sbl => italian.sbl}        |   24 +-
 .../algorithms/italian/stem_ISO_8859_1.sbl         |  195 --
 .../stem_ISO_8859_1.sbl => kraaij_pohlmann.sbl}    |   31 +-
 contrib/snowball/algorithms/lithuanian.sbl         |  373 +++
 .../{lovins/stem_ISO_8859_1.sbl => lovins.sbl}     |    0
 contrib/snowball/algorithms/nepali.sbl             |   92 +
 .../stem_ISO_8859_1.sbl => norwegian.sbl}          |    8 +-
 .../algorithms/norwegian/stem_MS_DOS_Latin_I.sbl   |   80 -
 .../{porter/stem_ISO_8859_1.sbl => porter.sbl}     |    0
 .../stem_ISO_8859_1.sbl => portuguese.sbl}         |   30 +-
 .../algorithms/portuguese/stem_MS_DOS_Latin_I.sbl  |  218 --
 .../{romanian/stem_ISO_8859_2.sbl => romanian.sbl} |   10 +-
 .../snowball/algorithms/romanian/stem_Unicode.sbl  |  236 --
 .../{russian/stem_KOI8_R.sbl => russian.sbl}       |   76 +-
 .../snowball/algorithms/russian/stem_Unicode.sbl   |  215 --
 contrib/snowball/algorithms/serbian.sbl            | 2378 ++++++++++++++++++++
 .../{spanish/stem_ISO_8859_1.sbl => spanish.sbl}   |   18 +-
 .../algorithms/spanish/stem_MS_DOS_Latin_I.sbl     |  230 --
 .../{swedish/stem_ISO_8859_1.sbl => swedish.sbl}   |    8 +-
 .../algorithms/swedish/stem_MS_DOS_Latin_I.sbl     |   72 -
 contrib/snowball/algorithms/tamil.sbl              |  405 ++++
 .../{turkish/stem_Unicode.sbl => turkish.sbl}      |  201 +-
 contrib/snowball/charsets/ISO-8859-2.sbl           |   98 +
 contrib/snowball/charsets/KOI8-R.sbl               |   74 +
 contrib/snowball/charsets/cp850.sbl                |  130 ++
 contrib/snowball/compiler/analyser.c               |  866 +++++--
 contrib/snowball/compiler/driver.c                 |  466 +++-
 contrib/snowball/compiler/generator.c              | 1158 ++++++----
 contrib/snowball/compiler/generator_java.c         | 1452 ------------
 contrib/snowball/compiler/header.h                 |  215 +-
 contrib/snowball/compiler/space.c                  |   72 +-
 contrib/snowball/compiler/syswords.h               |    6 +-
 contrib/snowball/compiler/syswords2.h              |    4 +-
 contrib/snowball/compiler/tokeniser.c              |  407 ++--
 contrib/snowball/examples/stemwords.c              |  209 --
 contrib/snowball/include/libstemmer.h              |    9 +-
 .../snowball/java/org/tartarus/snowball/Among.java |   31 -
 .../org/tartarus/snowball/SnowballProgram.java     |  432 ----
 .../org/tartarus/snowball/SnowballStemmer.java     |    7 -
 .../java/org/tartarus/snowball/TestApp.java        |   77 -
 contrib/snowball/libstemmer/libstemmer_c.in        |   15 +-
 contrib/snowball/libstemmer/mkmodules.pl           |   25 +-
 contrib/snowball/libstemmer/modules.txt            |   44 +-
 contrib/snowball/runtime/api.c                     |   10 +-
 contrib/snowball/runtime/api.h                     |   10 +-
 contrib/snowball/runtime/header.h                  |    3 +-
 contrib/snowball/runtime/utilities.c               |  191 +-
 70 files changed, 8903 insertions(+), 6045 deletions(-)

diff --git a/contrib/snowball/CMakeLists.txt b/contrib/snowball/CMakeLists.txt
index 7910c7b3a..015f75d1b 100644
--- a/contrib/snowball/CMakeLists.txt
+++ b/contrib/snowball/CMakeLists.txt
@@ -1,20 +1,14 @@
 # End of configuration
-SET(LIBSTEM_ALGORITHMS danish dutch english finnish french german hungarian
-	italian norwegian porter portuguese romanian
-	russian spanish swedish turkish)
-SET(KOI8_ALGORITHMS russian)
-SET(ISO_8859_1_ALGORITHMS danish dutch english finnish french german italian
-			norwegian porter portuguese spanish swedish)
-SET(ISO_8859_2_ALGORITHMS hungarian romanian)
-SET(OTHER_ALGORITHMS german2 kraaij_pohlmann lovins)
-SET(ALL_ALGORITHMS ${LIBSTEM_ALGORITHMS} ${OTHER_ALGORITHMS})
+SET(LIBSTEM_ALGORITHMS arabic danish dutch english finnish french german greek hindi hungarian
+	indonesian italian lithuanian nepali norwegian porter portuguese romanian
+	russian serbian spanish swedish tamil turkish)
+SET(ALL_ALGORITHMS ${LIBSTEM_ALGORITHMS})
 
 SET(COMPILER_SOURCES compiler/space.c
 		   compiler/tokeniser.c
 		   compiler/analyser.c
 		   compiler/generator.c
-		   compiler/driver.c
-		   compiler/generator_java.c)
+		   compiler/driver.c)
 
 SET(SNOWBALL_RUNTIME runtime/api.c
 		   runtime/utilities.c)
@@ -24,9 +18,15 @@ SET(LIBSTEMMER_UTF8_SOURCES libstemmer/libstemmer_utf8.c)
 #LIBSTEMMER_HEADERS = include/libstemmer.h libstemmer/modules.h libstemmer/modules_utf8.h
 #LIBSTEMMER_EXTRA = libstemmer/modules.txt libstemmer/modules_utf8.txt libstemmer/libstemmer_c.in
 
-SET(STEMWORDS_SOURCES examples/stemwords.c)
 SET(MODULES_H "modules.h")
 CONFIGURE_FILE(${CMAKE_CURRENT_SOURCE_DIR}/libstemmer/libstemmer_c.in ${CMAKE_CURRENT_BINARY_DIR}/libstemmer/libstemmer.c @ONLY)
+ADD_DEFINITIONS("-DDISABLE_JS")
+ADD_DEFINITIONS("-DDISABLE_GO")
+ADD_DEFINITIONS("-DDISABLE_JAVA")
+ADD_DEFINITIONS("-DDISABLE_PYTHON")
+ADD_DEFINITIONS("-DDISABLE_CSHARP")
+ADD_DEFINITIONS("-DDISABLE_PASCAL")
+ADD_DEFINITIONS("-DDISABLE_RUST")
 
 MACRO(gen_stem IN ENCODING)
 	FOREACH(_it ${IN})
@@ -34,7 +34,7 @@ MACRO(gen_stem IN ENCODING)
 		SET(_header "${_base}.h")
 		SET(_source "${_base}.c")
 		STRING(REPLACE "UTF_8" "Unicode" _in_enc "${ENCODING}")
-		SET(_input "${CMAKE_CURRENT_SOURCE_DIR}/algorithms/${_it}/stem_${_in_enc}.sbl")
+		SET(_input "${CMAKE_CURRENT_SOURCE_DIR}/algorithms/${_it}.sbl")
 		IF(${_in_enc} STREQUAL "Unicode" AND NOT EXISTS ${_input})
 			ADD_CUSTOM_COMMAND(OUTPUT ${_source}
 				COMMAND ${CMAKE_CURRENT_BINARY_DIR}/snowball  "${CMAKE_CURRENT_SOURCE_DIR}/algorithms/${_it}/stem_ISO_8859_1.sbl" -o ${_base} -eprefix ${_it}_${ENCODING}_ -r ../runtime -u
@@ -57,7 +57,7 @@ INCLUDE_DIRECTORIES("include")
 ADD_EXECUTABLE(snowball ${COMPILER_SOURCES})
 
 ADD_CUSTOM_COMMAND(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/libstemmer/modules.h
- COMMAND ${PERL_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/libstemmer/mkmodules.pl libstemmer/modules.h libstemmer ${CMAKE_CURRENT_SOURCE_DIR}/libstemmer/modules.txt libstemmer/mkinc.mak)
+ COMMAND ${PERL_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/libstemmer/mkmodules.pl ${CMAKE_CURRENT_BINARY_DIR}/libstemmer/modules.h libstemmer ${CMAKE_CURRENT_SOURCE_DIR}/libstemmer/modules.txt ${CMAKE_CURRENT_BINARY_DIR}/libstemmer/mkinc.mak)
 ADD_CUSTOM_TARGET(modules DEPENDS "${CMAKE_CURRENT_BINARY_DIR}/libstemmer/modules.h")
 
 SET(STEMMER_SOURCES "${CMAKE_CURRENT_BINARY_DIR}/libstemmer/libstemmer.c")
@@ -65,13 +65,6 @@ ADD_CUSTOM_TARGET(stemmer_deps ALL)
 ADD_DEPENDENCIES(stemmer_deps modules)
 
 gen_stem("${LIBSTEM_ALGORITHMS}" "UTF_8")
-gen_stem("${KOI8_ALGORITHMS}" "KOI8_R")
-gen_stem("${ISO_8859_1_ALGORITHMS}" "ISO_8859_1")
-gen_stem("${ISO_8859_2_ALGORITHMS}" "ISO_8859_2")
-
 
 ADD_LIBRARY(stemmer ${LINK_TYPE} ${SNOWBALL_RUNTIME} ${STEMMER_SOURCES})
 ADD_DEPENDENCIES(stemmer stemmer_deps)
-
-ADD_EXECUTABLE(stemwords ${STEMWORDS_SOURCES})
-TARGET_LINK_LIBRARIES(stemwords stemmer)
diff --git a/contrib/snowball/GNUmakefile b/contrib/snowball/GNUmakefile
deleted file mode 100644
index a30cafd89..000000000
--- a/contrib/snowball/GNUmakefile
+++ /dev/null
@@ -1,300 +0,0 @@
-# -*- makefile -*-
-
-c_src_dir = src_c
-java_src_main_dir = java/org/tartarus/snowball
-java_src_dir = $(java_src_main_dir)/ext
-
-libstemmer_algorithms = danish dutch english finnish french german hungarian \
-			italian \
-			norwegian porter portuguese romanian \
-			russian spanish swedish turkish
-
-KOI8_R_algorithms = russian
-ISO_8859_1_algorithms = danish dutch english finnish french german italian \
-			norwegian porter portuguese spanish swedish
-ISO_8859_2_algorithms = hungarian romanian
-
-other_algorithms = german2 kraaij_pohlmann lovins
-
-all_algorithms = $(libstemmer_algorithms) $(other_algorithms)
-
-COMPILER_SOURCES = compiler/space.c \
-		   compiler/tokeniser.c \
-		   compiler/analyser.c \
-		   compiler/generator.c \
-		   compiler/driver.c \
-		   compiler/generator_java.c
-COMPILER_HEADERS = compiler/header.h \
-		   compiler/syswords.h \
-		   compiler/syswords2.h
-
-RUNTIME_SOURCES  = runtime/api.c \
-		   runtime/utilities.c
-RUNTIME_HEADERS  = runtime/api.h \
-		   runtime/header.h
-
-JAVARUNTIME_SOURCES = java/org/tartarus/snowball/Among.java \
-		      java/org/tartarus/snowball/SnowballProgram.java \
-		      java/org/tartarus/snowball/SnowballStemmer.java \
-		      java/org/tartarus/snowball/TestApp.java
-
-LIBSTEMMER_SOURCES = libstemmer/libstemmer.c
-LIBSTEMMER_UTF8_SOURCES = libstemmer/libstemmer_utf8.c
-LIBSTEMMER_HEADERS = include/libstemmer.h libstemmer/modules.h libstemmer/modules_utf8.h
-LIBSTEMMER_EXTRA = libstemmer/modules.txt libstemmer/modules_utf8.txt libstemmer/libstemmer_c.in
-
-STEMWORDS_SOURCES = examples/stemwords.c
-
-ALL_ALGORITHM_FILES = $(all_algorithms:%=algorithms/%/stem*.sbl)
-C_LIB_SOURCES = $(libstemmer_algorithms:%=$(c_src_dir)/stem_UTF_8_%.c) \
-		$(KOI8_R_algorithms:%=$(c_src_dir)/stem_KOI8_R_%.c) \
-		$(ISO_8859_1_algorithms:%=$(c_src_dir)/stem_ISO_8859_1_%.c) \
-		$(ISO_8859_2_algorithms:%=$(c_src_dir)/stem_ISO_8859_2_%.c)
-C_LIB_HEADERS = $(libstemmer_algorithms:%=$(c_src_dir)/stem_UTF_8_%.h) \
-		$(KOI8_R_algorithms:%=$(c_src_dir)/stem_KOI8_R_%.h) \
-		$(ISO_8859_1_algorithms:%=$(c_src_dir)/stem_ISO_8859_1_%.h) \
-		$(ISO_8859_2_algorithms:%=$(c_src_dir)/stem_ISO_8859_2_%.h)
-C_OTHER_SOURCES = $(other_algorithms:%=$(c_src_dir)/stem_UTF_8_%.c)
-C_OTHER_HEADERS = $(other_algorithms:%=$(c_src_dir)/stem_UTF_8_%.h)
-JAVA_SOURCES = $(libstemmer_algorithms:%=$(java_src_dir)/%Stemmer.java)
-
-COMPILER_OBJECTS=$(COMPILER_SOURCES:.c=.o)
-RUNTIME_OBJECTS=$(RUNTIME_SOURCES:.c=.o)
-LIBSTEMMER_OBJECTS=$(LIBSTEMMER_SOURCES:.c=.o)
-LIBSTEMMER_UTF8_OBJECTS=$(LIBSTEMMER_UTF8_SOURCES:.c=.o)
-STEMWORDS_OBJECTS=$(STEMWORDS_SOURCES:.c=.o)
-C_LIB_OBJECTS = $(C_LIB_SOURCES:.c=.o)
-C_OTHER_OBJECTS = $(C_OTHER_SOURCES:.c=.o)
-JAVA_CLASSES = $(JAVA_SOURCES:.java=.class)
-JAVA_RUNTIME_CLASSES=$(JAVARUNTIME_SOURCES:.java=.class)
-
-CFLAGS=-Iinclude -O2
-CPPFLAGS=-W -Wall -Wmissing-prototypes -Wmissing-declarations
-
-all: snowball libstemmer.o stemwords $(C_OTHER_SOURCES) $(C_OTHER_HEADERS) $(C_OTHER_OBJECTS)
-
-clean:
-	rm -f $(COMPILER_OBJECTS) $(RUNTIME_OBJECTS) \
-	      $(LIBSTEMMER_OBJECTS) $(LIBSTEMMER_UTF8_OBJECTS) $(STEMWORDS_OBJECTS) snowball \
-	      libstemmer.o stemwords \
-              libstemmer/modules.h \
-              libstemmer/modules_utf8.h \
-              snowball.splint \
-	      $(C_LIB_SOURCES) $(C_LIB_HEADERS) $(C_LIB_OBJECTS) \
-	      $(C_OTHER_SOURCES) $(C_OTHER_HEADERS) $(C_OTHER_OBJECTS) \
-	      $(JAVA_SOURCES) $(JAVA_CLASSES) $(JAVA_RUNTIME_CLASSES) \
-              libstemmer/mkinc.mak libstemmer/mkinc_utf8.mak \
-              libstemmer/libstemmer.c libstemmer/libstemmer_utf8.c
-	rm -rf dist
-	rmdir $(c_src_dir) || true
-
-snowball: $(COMPILER_OBJECTS)
-	$(CC) -o $@ $^
-
-$(COMPILER_OBJECTS): $(COMPILER_HEADERS)
-
-libstemmer/libstemmer.c: libstemmer/libstemmer_c.in
-	sed 's/@MODULES_H@/modules.h/' $^ >$@
-
-libstemmer/libstemmer_utf8.c: libstemmer/libstemmer_c.in
-	sed 's/@MODULES_H@/modules_utf8.h/' $^ >$@
-
-libstemmer/modules.h libstemmer/mkinc.mak: libstemmer/mkmodules.pl libstemmer/modules.txt
-	libstemmer/mkmodules.pl $@ $(c_src_dir) libstemmer/modules.txt libstemmer/mkinc.mak
-
-libstemmer/modules_utf8.h libstemmer/mkinc_utf8.mak: libstemmer/mkmodules.pl libstemmer/modules_utf8.txt
-	libstemmer/mkmodules.pl $@ $(c_src_dir) libstemmer/modules_utf8.txt libstemmer/mkinc_utf8.mak utf8
-
-libstemmer/libstemmer.o: libstemmer/modules.h $(C_LIB_HEADERS)
-
-libstemmer.o: libstemmer/libstemmer.o $(RUNTIME_OBJECTS) $(C_LIB_OBJECTS)
-	$(AR) -cru $@ $^
-
-stemwords: $(STEMWORDS_OBJECTS) libstemmer.o
-	$(CC) -o $@ $^
-
-algorithms/%/stem_Unicode.sbl: algorithms/%/stem_ISO_8859_1.sbl
-	cp $^ $@
-
-$(c_src_dir)/stem_UTF_8_%.c $(c_src_dir)/stem_UTF_8_%.h: algorithms/%/stem_Unicode.sbl snowball
-	@mkdir -p $(c_src_dir)
-	@l=`echo "$<" | sed 's!\(.*\)/stem_Unicode.sbl$$!\1!;s!^.*/!!'`; \
-	o="$(c_src_dir)/stem_UTF_8_$${l}"; \
-	echo "./snowball $< -o $${o} -eprefix $${l}_UTF_8_ -r ../runtime -u"; \
-	./snowball $< -o $${o} -eprefix $${l}_UTF_8_ -r ../runtime -u
-
-$(c_src_dir)/stem_KOI8_R_%.c $(c_src_dir)/stem_KOI8_R_%.h: algorithms/%/stem_KOI8_R.sbl snowball
-	@mkdir -p $(c_src_dir)
-	@l=`echo "$<" | sed 's!\(.*\)/stem_KOI8_R.sbl$$!\1!;s!^.*/!!'`; \
-	o="$(c_src_dir)/stem_KOI8_R_$${l}"; \
-	echo "./snowball $< -o $${o} -eprefix $${l}_KOI8_R_ -r ../runtime"; \
-	./snowball $< -o $${o} -eprefix $${l}_KOI8_R_ -r ../runtime
-
-$(c_src_dir)/stem_ISO_8859_1_%.c $(c_src_dir)/stem_ISO_8859_1_%.h: algorithms/%/stem_ISO_8859_1.sbl snowball
-	@mkdir -p $(c_src_dir)
-	@l=`echo "$<" | sed 's!\(.*\)/stem_ISO_8859_1.sbl$$!\1!;s!^.*/!!'`; \
-	o="$(c_src_dir)/stem_ISO_8859_1_$${l}"; \
-	echo "./snowball $< -o $${o} -eprefix $${l}_ISO_8859_1_ -r ../runtime"; \
-	./snowball $< -o $${o} -eprefix $${l}_ISO_8859_1_ -r ../runtime
-
-$(c_src_dir)/stem_ISO_8859_2_%.c $(c_src_dir)/stem_ISO_8859_2_%.h: algorithms/%/stem_ISO_8859_2.sbl snowball
-	@mkdir -p $(c_src_dir)
-	@l=`echo "$<" | sed 's!\(.*\)/stem_ISO_8859_2.sbl$$!\1!;s!^.*/!!'`; \
-	o="$(c_src_dir)/stem_ISO_8859_2_$${l}"; \
-	echo "./snowball $< -o $${o} -eprefix $${l}_ISO_8859_2_ -r ../runtime"; \
-	./snowball $< -o $${o} -eprefix $${l}_ISO_8859_2_ -r ../runtime
-
-$(c_src_dir)/stem_%.o: $(c_src_dir)/stem_%.c $(c_src_dir)/stem_%.h
-	$(CC) $(CFLAGS) $(CPPFLAGS) -c -o $@ $<
-
-$(java_src_dir)/%Stemmer.java: algorithms/%/stem_Unicode.sbl snowball
-	@mkdir -p $(java_src_dir)
-	@l=`echo "$<" | sed 's!\(.*\)/stem_Unicode.sbl$$!\1!;s!^.*/!!'`; \
-	o="$(java_src_dir)/$${l}Stemmer"; \
-	echo "./snowball $< -j -o $${o} -p \"org.tartarus.snowball.SnowballStemmer\" -eprefix $${l}_ -r ../runtime -n $${l}Stemmer"; \
-	./snowball $< -j -o $${o} -p "org.tartarus.snowball.SnowballStemmer" -eprefix $${l}_ -r ../runtime -n $${l}Stemmer
-
-splint: snowball.splint
-snowball.splint: $(COMPILER_SOURCES)
-	splint $^ >$@ -weak
-
-# Make a full source distribution
-dist: dist_snowball dist_libstemmer_c dist_libstemmer_java
-
-# Make a distribution of all the sources involved in snowball
-dist_snowball: $(COMPILER_SOURCES) $(COMPILER_HEADERS) \
-	    $(RUNTIME_SOURCES) $(RUNTIME_HEADERS) \
-	    $(LIBSTEMMER_SOURCES) \
-	    $(LIBSTEMMER_UTF8_SOURCES) \
-            $(LIBSTEMMER_HEADERS) \
-	    $(LIBSTEMMER_EXTRA) \
-	    $(ALL_ALGORITHM_FILES) $(STEMWORDS_SOURCES) \
-	    GNUmakefile README doc/TODO libstemmer/mkmodules.pl
-	destname=snowball_code; \
-	dest=dist/$${destname}; \
-	rm -rf $${dest} && \
-	rm -f $${dest}.tgz && \
-	for file in $^; do \
-	  dir=`dirname $$file` && \
-	  mkdir -p $${dest}/$${dir} && \
-	  cp -a $${file} $${dest}/$${dir} || exit 1 ; \
-	done && \
-	(cd dist && tar zcf $${destname}.tgz $${destname}) && \
-	rm -rf $${dest}
-
-# Make a distribution of all the sources required to compile the C library.
-dist_libstemmer_c: \
-            $(RUNTIME_SOURCES) \
-            $(RUNTIME_HEADERS) \
-            $(LIBSTEMMER_SOURCES) \
-            $(LIBSTEMMER_UTF8_SOURCES) \
-            $(LIBSTEMMER_HEADERS) \
-            $(LIBSTEMMER_EXTRA) \
-	    $(C_LIB_SOURCES) \
-            $(C_LIB_HEADERS) \
-            libstemmer/mkinc.mak \
-            libstemmer/mkinc_utf8.mak
-	destname=libstemmer_c; \
-	dest=dist/$${destname}; \
-	rm -rf $${dest} && \
-	rm -f $${dest}.tgz && \
-	mkdir -p $${dest} && \
-	cp -a doc/libstemmer_c_README $${dest}/README && \
-	mkdir -p $${dest}/examples && \
-	cp -a examples/stemwords.c $${dest}/examples && \
-	mkdir -p $${dest}/$(c_src_dir) && \
-	cp -a $(C_LIB_SOURCES) $(C_LIB_HEADERS) $${dest}/$(c_src_dir) && \
-	mkdir -p $${dest}/runtime && \
-	cp -a $(RUNTIME_SOURCES) $(RUNTIME_HEADERS) $${dest}/runtime && \
-	mkdir -p $${dest}/libstemmer && \
-	cp -a $(LIBSTEMMER_SOURCES) $(LIBSTEMMER_UTF8_SOURCES) $(LIBSTEMMER_HEADERS) $(LIBSTEMMER_EXTRA) $${dest}/libstemmer && \
-	mkdir -p $${dest}/include && \
-	mv $${dest}/libstemmer/libstemmer.h $${dest}/include && \
-	(cd $${dest} && \
-	 echo "README" >> MANIFEST && \
-	 ls $(c_src_dir)/*.c $(c_src_dir)/*.h >> MANIFEST && \
-	 ls runtime/*.c runtime/*.h >> MANIFEST && \
-	 ls libstemmer/*.c libstemmer/*.h >> MANIFEST && \
-	 ls include/*.h >> MANIFEST) && \
-        cp -a libstemmer/mkinc.mak libstemmer/mkinc_utf8.mak $${dest}/ && \
-	echo 'include mkinc.mak' >> $${dest}/Makefile && \
-	echo 'CFLAGS=-Iinclude' >> $${dest}/Makefile && \
-	echo 'all: libstemmer.o stemwords' >> $${dest}/Makefile && \
-	echo 'libstemmer.o: $$(snowball_sources:.c=.o)' >> $${dest}/Makefile && \
-	echo '	$$(AR) -cru $$@ $$^' >> $${dest}/Makefile && \
-	echo 'stemwords: examples/stemwords.o libstemmer.o' >> $${dest}/Makefile && \
-	echo '	$$(CC) -o $$@ $$^' >> $${dest}/Makefile && \
-	echo 'clean:' >> $${dest}/Makefile && \
-	echo '	rm -f stemwords *.o $(c_src_dir)/*.o runtime/*.o libstemmer/*.o' >> $${dest}/Makefile && \
-	(cd dist && tar zcf $${destname}.tgz $${destname}) && \
-	rm -rf $${dest}
-
-# Make a distribution of all the sources required to compile the Java library.
-dist_libstemmer_java: $(RUNTIME_SOURCES) $(RUNTIME_HEADERS) \
-            $(LIBSTEMMER_EXTRA) \
-	    $(JAVA_SOURCES)
-	destname=libstemmer_java; \
-	dest=dist/$${destname}; \
-	rm -rf $${dest} && \
-	rm -f $${dest}.tgz && \
-	mkdir -p $${dest} && \
-	cp -a doc/libstemmer_java_README $${dest}/README && \
-	mkdir -p $${dest}/$(java_src_dir) && \
-	cp -a $(JAVA_SOURCES) $${dest}/$(java_src_dir) && \
-	mkdir -p $${dest}/$(java_src_main_dir) && \
-	cp -a $(JAVARUNTIME_SOURCES) $${dest}/$(java_src_main_dir) && \
-	(cd $${dest} && \
-	 echo "README" >> MANIFEST && \
-	 ls $(java_src_dir)/*.java >> MANIFEST && \
-	 ls $(java_src_main_dir)/*.java >> MANIFEST) && \
-	(cd dist && tar zcf $${destname}.tgz $${destname}) && \
-	rm -rf $${dest}
-
-check: check_utf8 check_iso_8859_1 check_iso_8859_2 check_koi8r
-
-check_utf8: $(libstemmer_algorithms:%=check_utf8_%)
-
-check_iso_8859_1: $(ISO_8859_1_algorithms:%=check_iso_8859_1_%)
-
-check_iso_8859_2: $(ISO_8859_2_algorithms:%=check_iso_8859_2_%)
-
-check_koi8r: $(KOI8_R_algorithms:%=check_koi8r_%)
-
-# Where the data files are located - assumed their repo is checked out as
-# a sibling to this one.
-STEMMING_DATA = ../snowball-data
-
-check_utf8_%: $(STEMMING_DATA)/% stemwords
-	@echo "Checking output of `echo $<|sed 's!.*/!!'` stemmer with UTF-8"
-	@./stemwords -c UTF_8 -l `echo $<|sed 's!.*/!!'` -i $</voc.txt -o tmp.txt
-	@diff -u $</output.txt tmp.txt
-	@if [ -e $</diffs.txt ] ; \
-	then \
-	  ./stemwords -c UTF_8 -l `echo $<|sed 's!.*/!!'` -i $</voc.txt -o tmp.txt -p2 && \
-	  diff -u $</diffs.txt tmp.txt; \
-	fi
-	@rm tmp.txt
-
-check_iso_8859_1_%: $(STEMMING_DATA)/% stemwords
-	@echo "Checking output of `echo $<|sed 's!.*/!!'` stemmer with ISO_8859_1"
-	@python -c 'print(open("$</voc.txt").read().decode("utf8").encode("iso8859-1"))' | \
-	    ./stemwords -c ISO_8859_1 -l `echo $<|sed 's!.*/!!'` -o tmp.txt
-	@python -c 'print(open("$</output.txt").read().decode("utf8").encode("iso8859-1"))' | \
-	    diff -u - tmp.txt
-	@rm tmp.txt
-
-check_iso_8859_2_%: $(STEMMING_DATA)/% stemwords
-	@echo "Checking output of `echo $<|sed 's!.*/!!'` stemmer with ISO_8859_2"
-	@python -c 'print(open("$</voc.txt").read().decode("utf8").encode("iso8859-2"))' | \
-	    ./stemwords -c ISO_8859_2 -l `echo $<|sed 's!.*/!!'` -o tmp.txt
-	@python -c 'print(open("$</output.txt").read().decode("utf8").encode("iso8859-2"))' | \
-	    diff -u - tmp.txt
-	@rm tmp.txt
-
-check_koi8r_%: $(STEMMING_DATA)/% stemwords
-	@echo "Checking output of `echo $<|sed 's!.*/!!'` stemmer with KOI8R"
-	@python -c 'print(open("$</voc.txt").read().decode("utf8").encode("koi8_r"))' | \
-	    ./stemwords -c KOI8_R -l `echo $<|sed 's!.*/!!'` -o tmp.txt
-	@python -c 'print(open("$</output.txt").read().decode("utf8").encode("koi8_r"))' | \
-	    diff -u - tmp.txt
-	@rm tmp.txt
diff --git a/contrib/snowball/NEWS b/contrib/snowball/NEWS
new file mode 100644
index 000000000..c71c12dd3
--- /dev/null
+++ b/contrib/snowball/NEWS
@@ -0,0 +1,407 @@
+Snowball 2.0.0 (2019-10-02)
+===========================
+
+C/C++
+-----
+
+* Fully handle 4-byte UTF-8 sequences.  Previously `hop` and `next` handled
+  sequences of any length, but commands which look at the character value only
+  handled sequences up to length 3.  Fixes #89.
+
+* Fix handling of a 3-byte UTF-8 sequence in a grouping in `backwardmode`.
+
+Java
+----
+
+* TestApp.java:
+
+  - Always use UTF-8 for I/O.  Patch from David Corbett (#80).
+
+  - Allow reading input from stdin.
+
+  - Remove rather pointless "stem n times" feature.
+
+  - Only lower case ASCII to match stemwords.c.
+
+  - Stem empty lines too to match stemwords.c.
+
+Code Quality Improvements
+-------------------------
+
+* Fix various warnings from newer compilers.
+
+* Improve use of `const`.
+
+* Share common functions between compiler backends rather than having multiple
+  copies of the same code.
+
+* Assorted code clean-up.
+
+* Initialise line_labelled member of struct generator to 0.  Previously we were
+  invoking undefined behaviour, though in practice it'll be zero initialised on
+  most platforms.
+
+New Code Generators
+-------------------
+
+* Add Python generator (#24).  Originally written by Yoshiki Shibukawa, with
+  additional updates by Dmitry Shachnev.
+
+* Add Javascript generator.  Based on JSX generator (#26) written by Yoshiki
+  Shibukawa.
+
+* Add Rust generator from Jakob Demler (#51).
+
+* Add Go generator from Marty Schoch (#57).
+
+* Add C# generator.  Based on patch from Cesar Souza (#16, #17).
+
+* Add Pascal generator.  Based on Delphi backend from stemming.zip file on old
+  website (#75).
+
+New Language Features
+---------------------
+
+* Add `len` and `lenof` to measure Unicode length.  These are similar to `size`
+  and `sizeof` (respectively), but `size` and `sizeof` return the length in
+  bytes under `-utf8`, whereas these new commands give the same result whether
+  using `-utf8`, `-widechars` or neither (but under `-utf8` they are O(n) in
+  the length of the string).  For compatibility with existing code which might
+  use these as variable or function names, they stop being treated as tokens if
+  declared to be a variable or function.
+
+* New `{U+1234}` stringdef notation for Unicode codepoints.
+
+* More versatile integer tests.  Now you can compare any two arithmetic
+  expressions with a relational operator in parentheses after the `$`, so for
+  example `$(len > 3)` can now be used when previously a temporary variable was
+  required: `$tmp = len $tmp > 3`
+
+Code generation improvements
+----------------------------
+
+* General:
+
+  + Avoid unnecessary saving and restoring of the cursor for more commands -
+    `atlimit`, `do`, `set` and `unset` all leave the cursor alone or always
+    restore its value, and for C `booltest` (which other languages already
+    handled).
+
+  + Special case handling for `setlimit tomark AE`.  All uses of setlimit in
+    the current stemmers we ship follow this pattern, and by special-casing we
+    can avoid having to save and restore the cursor (#74).
+
+  + Merge duplicate actions in the same `among`.  This reduces the size of the
+    switch/if-chain in the generated code which dispatch the among for many of
+    the stemmers.
+
+  + Generate simpler code for `among`.  We always check for a zero return value
+    when we call the among, so there's no point also checking for that in the
+    switch/if-chain.  We can also avoid the switch/if-chain entirely when
+    there's only one possible outcome (besides the zero return).
+
+  + Optimise code generated for `do <function call>`.  This speeds up "make
+    check_python" by about 2%, and should speed up other interpreted languages
+    too (#110).
+
+  + Generate more and better comments referencing snowball source.
+
+  + Add homepage URL and compiler version as comments in generated files.
+
+* C/C++:
+
+  + Fix `size` and `sizeof` to not report one too high (reported by Assem
+    Chelli in #32).
+
+  + If signal `f` from a function call would lead to return from the current
+    function then handle this and bailing out on an error together with a
+    simple `if (ret <= 0) return ret;`
+
+  + Inline testing for single character literals.
+
+  + Avoid generating `|| 0` in a corner case - this can result in a compiler
+    warning when building the generated code.
+
+  + Implement `insert_v()` in terms of `insert_s()`.
+
+  + Add conditional `extern "C"` so `runtime/api.h` can be included from C++
+    code.  Closes #90, reported by vvarma.
+
+* Java:
+
+  + Fix functions in `among` to work in Java.  We seem to need to make the
+    methods called from among `public` instead of `private`, and to call them
+    on `this` instead of the `methodObject` (which is cleaner anyway).  No
+    revision in version control seems to generate working code for this case,
+    but Richard says it definitely used to work - possibly older JVMs failed to
+    correctly enforce the access controls when methods were invoked by
+    reflection.
+
+  + Code after handling `f` by returning from the current function is
+    unreachable too.
+
+  + Previously we incorrectly decided that code after an `or` was
+    unreachable in certain cases.  None of the current stemmers in the
+    distribution triggered this, but Martin Porter's snowball version
+    of the Schinke Latin stemmer does.  Fixes #58, reported by Alexander
+    Myltsev.
+
+  + The reachability logic was failing to consider reachability from
+    the final command in an `or`.  Fixes #82, reported by David Corbett.
+
+  + Fix `maxint` and `minint`.  Patch from David Corbett in #31.
+
+  + Fix `$` on strings.  The previous generated code was just wrong.  This
+    doesn't affect any of the included algorithms, but for example breaks
+    Martin Porter's snowball implementation of Schinke's Latin Stemmer.
+    Issue noted by Jakob Demler while working on the Rust backend in #51,
+    and reported in the Schinke's Latin Stemmer by Alexander Myltsev
+    in #58.
+
+  + Make SnowballProgram objects serializable.  Patch from Oleg Smirnov in #43.
+
+  + Eliminate range-check implementation for groupings.  This was removed from
+    the C generator 10 years earlier, isn't used for any of the existing
+    algorithms, and it doesn't seem likely it would be - the grouping would
+    have to consist entirely of a contiguous block of Unicode code-points.
+
+  + Simplify code generated for `repeat` and `atleast`.
+
+  + Eliminate unused return values and variables from runtime functions.
+
+  + Only import the `among` and `SnowballProgram` classes if they're actually
+    used.
+
+  + Only generate `copy_from()` method if it's used.
+
+  + Merge runtime functions `eq_s` and `eq_v` functions.
+
+  + Java arrays know their own length so stop storing it separately.
+
+  + Escape char 127 (DEL) in generated Java code.  It's unlikely that this
+    character would actually be used in a real stemmer, so this was more of a
+    theoretical bug.
+
+  + Drop unused import of InvocationTargetException from SnowballStemmer.
+    Reported by GerritDeMeulder in #72.
+
+  + Fix lint check issues in generated Java code.  The stemmer classes are only
+    referenced in the example app via reflection, so add
+    @SuppressWarnings("unused") for them.  The stemmer classes override
+    equals() and hashCode() methods from the standard java Object class, so
+    mark these with @Override.  Both suggested by GerritDeMeulder in #72.
+
+  + Declare Java variables at point of use in generated code.  Putting all
+    declarations at the top of the function was adding unnecessary complexity
+    to the Java generator code for no benefit.
+
+  + Improve formatting of generated code.
+
+New stemming algorithms
+-----------------------
+
+* Add Tamil stemmer from Damodharan Rajalingam (#2, #3).
+
+* Add Arabic stemmer from Assem Chelli (#32, #50).
+
+* Add Irish stemmer from Jim O'Regan (#48).
+
+* Add Nepali stemmer from Arthur Zakirov (#70).
+
+* Add Indonesian stemmer from Olly Betts (#71).
+
+* Add Hindi stemmer from Olly Betts (#73). Thanks to David Corbett for review.
+
+* Add Lithuanian stemmer from Dainius Jocas (#22, #76).
+
+* Add Greek stemmer from Oleg Smirnov (#44).
+
+* Add Catalan and Basque stemmers from Israel Olalla (#104).
+
+Behavioural changes to existing algorithms
+------------------------------------------
+
+* Portuguese:
+
+  + Replace incorrect Spanish suffixes by Portuguese suffixes (#1).
+
+* French:
+
+  + The MSDOS CP850 version of the French algorithm was missing changes present
*** OUTPUT TRUNCATED, 17538 LINES SKIPPED ***


More information about the Commits mailing list