From 9cdabea4b8ad6636157968de3752f3571ec0532b Mon Sep 17 00:00:00 2001
From: Shinsuke Sugaya <shinsuke@apache.org>
Date: Thu, 14 Mar 2019 11:05:53 +0900
Subject: [PATCH] fix #2036 add langdetect

---
 pom.xml                                       | 12 +++++
 .../codelibs/fess/helper/LanguageHelper.java  | 49 +++++++++++++++++--
 src/main/resources/fess.xml                   |  5 ++
 3 files changed, 63 insertions(+), 3 deletions(-)

diff --git a/pom.xml b/pom.xml
index 486b58775..cc197ec02 100644
--- a/pom.xml
+++ b/pom.xml
@@ -47,6 +47,7 @@
 		<jackson.version>2.8.11</jackson.version>
 		<commons.fileupload.version>1.3.3</commons.fileupload.version>
 		<asm.version>5.1</asm.version>
+		<tika.version>1.20</tika.version>
 
 		<!-- Testing -->
 		<junit.version>4.12</junit.version>
@@ -1320,6 +1321,17 @@
 				</exclusion>
 			</exclusions>
 		</dependency>
+		<dependency>
+			<groupId>org.apache.tika</groupId>
+			<artifactId>tika-langdetect</artifactId>
+			<version>${tika.version}</version>
+			<exclusions>
+				<exclusion>
+					<groupId>javax.annotation</groupId>
+					<artifactId>javax.annotation-api</artifactId>
+				</exclusion>
+			</exclusions>
+		</dependency>
 
 		<!-- suggest library -->
 		<dependency>
diff --git a/src/main/java/org/codelibs/fess/helper/LanguageHelper.java b/src/main/java/org/codelibs/fess/helper/LanguageHelper.java
index 323a7c2f2..473b7e274 100644
--- a/src/main/java/org/codelibs/fess/helper/LanguageHelper.java
+++ b/src/main/java/org/codelibs/fess/helper/LanguageHelper.java
@@ -19,16 +19,24 @@ import java.util.Map;
 
 import javax.annotation.PostConstruct;
 
+import org.apache.tika.language.detect.LanguageDetector;
+import org.apache.tika.language.detect.LanguageResult;
+import org.codelibs.core.lang.StringUtil;
 import org.codelibs.fess.mylasta.direction.FessConfig;
 import org.codelibs.fess.util.ComponentUtil;
 import org.codelibs.fess.util.DocumentUtil;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 
 public class LanguageHelper {
+    private static final Logger logger = LoggerFactory.getLogger(LanguageHelper.class);
 
     protected String[] langFields;
 
     protected String[] supportedLanguages;
 
+    protected LanguageDetector detector;
+
     @PostConstruct
     public void init() {
         final FessConfig fessConfig = ComponentUtil.getFessConfig();
@@ -37,21 +45,52 @@ public class LanguageHelper {
     }
 
     public void updateDocument(final Map<String, Object> doc) {
-        final String language =
-                getSupportedLanguage(DocumentUtil.getValue(doc, ComponentUtil.getFessConfig().getIndexFieldLang(), String.class));
+        final FessConfig fessConfig = ComponentUtil.getFessConfig();
+        String language = getSupportedLanguage(DocumentUtil.getValue(doc, fessConfig.getIndexFieldLang(), String.class)); // null when the stored lang value is blank or not in supportedLanguages
         if (language == null) {
-            return;
+            for (final String f : langFields) { // fall back to detecting the language from the language-aware fields
+                if (doc.containsKey(f)) {
+                    language = detectLanguage(DocumentUtil.getValue(doc, f, String.class)); // null when the field text is blank or the detected language is unsupported
+                    if (language != null) {
+                        if (logger.isDebugEnabled()) {
+                            logger.debug("set {} to lang field", language);
+                        }
+                        doc.put(fessConfig.getIndexFieldLang(), language); // store the detected language in the document's lang field
+                        break; // the first field that yields a supported language wins
+                    }
+                }
+            }
+            if (language == null) { // nothing detected from any field: leave the document unchanged
+                return;
+            }
         }
 
         for (final String f : langFields) {
             final String lf = f + "_" + language;
             if (doc.containsKey(f) && !doc.containsKey(lf)) {
                 doc.put(lf, doc.get(f));
+                if (logger.isDebugEnabled()) {
+                    logger.debug("add {} field", lf); // lf is the "<field>_<language>" copy that was just added
+                }
             }
         }
     }
 
+    protected String detectLanguage(final String text) { // returns a supported language code detected from text, or null
+        if (detector == null || StringUtil.isBlank(text)) { // no detector injected, or nothing to analyse: skip detection instead of failing
+            return null;
+        }
+        final LanguageResult result = detector.detect(text); // NOTE(review): one detector instance is shared by this helper; confirm detect(text) is safe under concurrent calls
+        if (logger.isDebugEnabled()) {
+            logger.debug("detected lang:{}({}) from {}", result, result.getRawScore(), text);
+        }
+        return getSupportedLanguage(result.getLanguage()); // null unless the detected code is one of supportedLanguages
+    }
+
     protected String getSupportedLanguage(final String lang) {
+        if (StringUtil.isBlank(lang)) {
+            return null;
+        }
         for (final String l : supportedLanguages) {
             if (l.equals(lang)) {
                 return l;
@@ -60,4 +99,8 @@ public class LanguageHelper {
         return null;
     }
 
+    public void setDetector(final LanguageDetector detector) { // setter for the "detector" property configured on languageHelper in fess.xml
+        this.detector = detector;
+    }
+
 }
diff --git a/src/main/resources/fess.xml b/src/main/resources/fess.xml
index d2c30a641..1528580b4 100644
--- a/src/main/resources/fess.xml
+++ b/src/main/resources/fess.xml
@@ -11,6 +11,11 @@
 	<component name="curlHelper" class="org.codelibs.fess.helper.CurlHelper">
 	</component>
 	<component name="languageHelper" class="org.codelibs.fess.helper.LanguageHelper">
+		<property name="detector">
+			<component class="org.apache.tika.langdetect.OptimaizeLangDetector">
+				<postConstruct name="loadModels"></postConstruct>
+			</component>
+		</property>
 	</component>
 	<component name="searchLogHelper" class="org.codelibs.fess.helper.SearchLogHelper">
 		<!-- 
-- 
GitLab