From ba8ae39fdbd9a31f8ac89a91daf4e7e02bd7ab9b Mon Sep 17 00:00:00 2001
From: Shinsuke Sugaya <shinsuke@apache.org>
Date: Thu, 22 Jun 2017 07:14:17 +0900
Subject: [PATCH] fix #1118 validate canonical url

---
 .../transformer/FessXpathTransformer.java     | 30 ++++++++++++++++++-
 .../transformer/FessXpathTransformerTest.java | 15 ++++++++++
 2 files changed, 44 insertions(+), 1 deletion(-)

diff --git a/src/main/java/org/codelibs/fess/crawler/transformer/FessXpathTransformer.java b/src/main/java/org/codelibs/fess/crawler/transformer/FessXpathTransformer.java
index dc75bfe93..edb42bc42 100644
--- a/src/main/java/org/codelibs/fess/crawler/transformer/FessXpathTransformer.java
+++ b/src/main/java/org/codelibs/fess/crawler/transformer/FessXpathTransformer.java
@@ -65,6 +65,7 @@ import org.codelibs.fess.mylasta.direction.FessConfig;
 import org.codelibs.fess.util.ComponentUtil;
 import org.codelibs.fess.util.PrunedTag;
 import org.cyberneko.html.parsers.DOMParser;
+import org.hibernate.validator.internal.constraintvalidators.hv.URLValidator;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import org.w3c.dom.Document;
@@ -219,11 +220,38 @@ public class FessXpathTransformer extends XpathTransformer implements FessTransf
 
     }
 
+    /**
+     * Tells whether a URL string can be parsed into a URL that names a usable host.
+     * Inputs starting with "://" or "//" are parsed as if they used the http scheme.
+     *
+     * @param urlStr the URL string to check; may be null or blank
+     * @return false for blank input, unparsable URLs, a blank host, or a host of "http"/"https"
+     */
+    protected boolean isValidUrl(final String urlStr) {
+        if (StringUtil.isBlank(urlStr)) {
+            return false;
+        }
+        // complete scheme-less input so that java.net.URL can parse it
+        final String prefix = urlStr.startsWith("://") ? "http" : urlStr.startsWith("//") ? "http:" : "";
+        final String hostName;
+        try {
+            hostName = new java.net.URL(prefix + urlStr).getHost();
+        } catch (MalformedURLException e) {
+            return false;
+        }
+        if (StringUtil.isBlank(hostName)) {
+            return false;
+        }
+        // a host named like a scheme results from a doubled prefix such as http://http://...
+        final boolean schemeAsHost = "http".equalsIgnoreCase(hostName) || "https".equalsIgnoreCase(hostName);
+        return !schemeAsHost;
+    }
+
     protected void putAdditionalData(final Map<String, Object> dataMap, final ResponseData responseData, final Document document) {
         // canonical
         if (StringUtil.isNotBlank(fessConfig.getCrawlerDocumentHtmlCanonicalXpath())) {
             final String canonicalUrl = getCanonicalUrl(responseData, document);
-            if (canonicalUrl != null && !canonicalUrl.equals(responseData.getUrl())) {
+            if (canonicalUrl != null && !canonicalUrl.equals(responseData.getUrl()) && isValidUrl(canonicalUrl)) {
                 final Set<RequestData> childUrlSet = new HashSet<>();
                 childUrlSet.add(RequestDataBuilder.newRequestData().get().url(canonicalUrl).build());
                 throw new ChildUrlsException(childUrlSet, this.getClass().getName()
diff --git a/src/test/java/org/codelibs/fess/crawler/transformer/FessXpathTransformerTest.java b/src/test/java/org/codelibs/fess/crawler/transformer/FessXpathTransformerTest.java
index 1a229fe41..fd5273e8d 100644
--- a/src/test/java/org/codelibs/fess/crawler/transformer/FessXpathTransformerTest.java
+++ b/src/test/java/org/codelibs/fess/crawler/transformer/FessXpathTransformerTest.java
@@ -727,4 +727,19 @@ public class FessXpathTransformerTest extends UnitFessTestCase {
 
         assertEquals(expected, transformer.getThumbnailUrl(responseData, document));
     }
+
+    public void test_isValidUrl() {
+        final FessXpathTransformer xpathTransformer = new FessXpathTransformer();
+        // absolute http/https URLs and the scheme-less "://" and "//" forms are accepted
+        final String[] accepted = { "http://www.example.com", "http://www.example.com/aaa", "https://www.example.com",
+                "://www.example.com", "//www.example.com" };
+        for (final String candidate : accepted) {
+            assertTrue(xpathTransformer.isValidUrl(candidate));
+        }
+        // null, blank, host-less and doubled-scheme inputs are rejected
+        final String[] rejected = { null, " ", "http://", "http://http://www.example.com" };
+        for (final String candidate : rejected) {
+            assertFalse(xpathTransformer.isValidUrl(candidate));
+        }
+    }
 }
-- 
GitLab