From 9cdabea4b8ad6636157968de3752f3571ec0532b Mon Sep 17 00:00:00 2001 From: Shinsuke Sugaya <shinsuke@apache.org> Date: Thu, 14 Mar 2019 11:05:53 +0900 Subject: [PATCH] fix #2036 add langdetect --- pom.xml | 12 +++++ .../codelibs/fess/helper/LanguageHelper.java | 49 +++++++++++++++++-- src/main/resources/fess.xml | 5 ++ 3 files changed, 63 insertions(+), 3 deletions(-) diff --git a/pom.xml b/pom.xml index 486b58775..cc197ec02 100644 --- a/pom.xml +++ b/pom.xml @@ -47,6 +47,7 @@ <jackson.version>2.8.11</jackson.version> <commons.fileupload.version>1.3.3</commons.fileupload.version> <asm.version>5.1</asm.version> + <tika.version>1.20</tika.version> <!-- Testing --> <junit.version>4.12</junit.version> @@ -1320,6 +1321,17 @@ </exclusion> </exclusions> </dependency> + <dependency> + <groupId>org.apache.tika</groupId> + <artifactId>tika-langdetect</artifactId> + <version>${tika.version}</version> + <exclusions> + <exclusion> + <groupId>javax.annotation</groupId> + <artifactId>javax.annotation-api</artifactId> + </exclusion> + </exclusions> + </dependency> <!-- suggest library --> <dependency> diff --git a/src/main/java/org/codelibs/fess/helper/LanguageHelper.java b/src/main/java/org/codelibs/fess/helper/LanguageHelper.java index 323a7c2f2..473b7e274 100644 --- a/src/main/java/org/codelibs/fess/helper/LanguageHelper.java +++ b/src/main/java/org/codelibs/fess/helper/LanguageHelper.java @@ -19,16 +19,24 @@ import java.util.Map; import javax.annotation.PostConstruct; +import org.apache.tika.language.detect.LanguageDetector; +import org.apache.tika.language.detect.LanguageResult; +import org.codelibs.core.lang.StringUtil; import org.codelibs.fess.mylasta.direction.FessConfig; import org.codelibs.fess.util.ComponentUtil; import org.codelibs.fess.util.DocumentUtil; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; public class LanguageHelper { + private static final Logger logger = LoggerFactory.getLogger(LanguageHelper.class); protected String[] langFields; protected String[] supportedLanguages; + protected LanguageDetector detector; + @PostConstruct public void init() { final FessConfig fessConfig = ComponentUtil.getFessConfig(); @@ -37,21 +45,52 @@ public class LanguageHelper { } public void updateDocument(final Map<String, Object> doc) { - final String language = - getSupportedLanguage(DocumentUtil.getValue(doc, ComponentUtil.getFessConfig().getIndexFieldLang(), String.class)); + final FessConfig fessConfig = ComponentUtil.getFessConfig(); + String language = getSupportedLanguage(DocumentUtil.getValue(doc, fessConfig.getIndexFieldLang(), String.class)); if (language == null) { - return; + for (final String f : langFields) { + if (doc.containsKey(f)) { + language = detectLanguage(DocumentUtil.getValue(doc, f, String.class)); + if (language != null) { + if (logger.isDebugEnabled()) { + logger.debug("set {} to lang field", language); + } + doc.put(fessConfig.getIndexFieldLang(), language); + break; + } + } + } + if (language == null) { + return; + } } for (final String f : langFields) { final String lf = f + "_" + language; if (doc.containsKey(f) && !doc.containsKey(lf)) { doc.put(lf, doc.get(f)); + if (logger.isDebugEnabled()) { + logger.debug("add {} field", lf); + } } } } + protected String detectLanguage(final String text) { + if (StringUtil.isBlank(text)) { + return null; + } + final LanguageResult result = detector.detect(text); + if (logger.isDebugEnabled()) { + logger.debug("detected lang:{}({}) from {}", result, result.getRawScore(), text); + } + return getSupportedLanguage(result.getLanguage()); + } + protected String getSupportedLanguage(final String lang) { + if (StringUtil.isBlank(lang)) { + return null; + } for (final String l : supportedLanguages) { if (l.equals(lang)) { return l; @@ -60,4 +99,8 @@ public class LanguageHelper { return null; } + public void setDetector(LanguageDetector detector) { + this.detector = detector; + } + } diff --git a/src/main/resources/fess.xml b/src/main/resources/fess.xml index d2c30a641..1528580b4 100644 --- a/src/main/resources/fess.xml +++ b/src/main/resources/fess.xml @@ -11,6 +11,11 @@ <component name="curlHelper" class="org.codelibs.fess.helper.CurlHelper"> </component> <component name="languageHelper" class="org.codelibs.fess.helper.LanguageHelper"> + <property name="detector"> + <component class="org.apache.tika.langdetect.OptimaizeLangDetector"> + <postConstruct name="loadModels"></postConstruct> + </component> + </property> </component> <component name="searchLogHelper" class="org.codelibs.fess.helper.SearchLogHelper"> <!-- -- GitLab