From 8e987df540872cad34b169d9b949831e35da3eb3 Mon Sep 17 00:00:00 2001
From: Shinsuke Sugaya <shinsuke@apache.org>
Date: Tue, 8 Aug 2017 06:50:32 +0900
Subject: [PATCH] fix #1211 css attribute selector support
---
.../fess/app/web/base/FessSearchAction.java | 2 +-
.../codelibs/fess/helper/LabelTypeHelper.java | 1 -
.../fess/mylasta/direction/FessProp.java | 37 +++++-----
.../org/codelibs/fess/util/PrunedTag.java | 68 ++++++++++---------
src/main/resources/fess_config.properties | 2 +-
.../fess/mylasta/direction/FessPropTest.java | 39 +++++++++++
6 files changed, 97 insertions(+), 52 deletions(-)
diff --git a/src/main/java/org/codelibs/fess/app/web/base/FessSearchAction.java b/src/main/java/org/codelibs/fess/app/web/base/FessSearchAction.java
index b5414cd39..dad7d9172 100644
--- a/src/main/java/org/codelibs/fess/app/web/base/FessSearchAction.java
+++ b/src/main/java/org/codelibs/fess/app/web/base/FessSearchAction.java
@@ -15,10 +15,10 @@
*/
package org.codelibs.fess.app.web.base;
+import java.util.ArrayList;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.List;
-import java.util.ArrayList;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
diff --git a/src/main/java/org/codelibs/fess/helper/LabelTypeHelper.java b/src/main/java/org/codelibs/fess/helper/LabelTypeHelper.java
index 3b3c015bc..83a801fc4 100644
--- a/src/main/java/org/codelibs/fess/helper/LabelTypeHelper.java
+++ b/src/main/java/org/codelibs/fess/helper/LabelTypeHelper.java
@@ -35,7 +35,6 @@ import org.codelibs.fess.Constants;
import org.codelibs.fess.app.service.LabelTypeService;
import org.codelibs.fess.entity.SearchRequestParams.SearchRequestType;
import org.codelibs.fess.es.config.exentity.LabelType;
-import org.codelibs.fess.mylasta.direction.FessConfig;
import org.codelibs.fess.util.ComponentUtil;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
diff --git a/src/main/java/org/codelibs/fess/mylasta/direction/FessProp.java b/src/main/java/org/codelibs/fess/mylasta/direction/FessProp.java
index 3c1aaa784..70d69bed2 100644
--- a/src/main/java/org/codelibs/fess/mylasta/direction/FessProp.java
+++ b/src/main/java/org/codelibs/fess/mylasta/direction/FessProp.java
@@ -35,6 +35,7 @@ import java.util.Map;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import java.util.function.Function;
+import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
@@ -47,6 +48,7 @@ import org.codelibs.core.lang.StringUtil;
import org.codelibs.core.misc.Pair;
import org.codelibs.core.misc.Tuple3;
import org.codelibs.fess.Constants;
+import org.codelibs.fess.exception.FessSystemException;
import org.codelibs.fess.helper.PermissionHelper;
import org.codelibs.fess.mylasta.action.FessUserBean;
import org.codelibs.fess.taglib.FessFunctions;
@@ -635,23 +637,26 @@ public interface FessProp {
PrunedTag[] tags = (PrunedTag[]) propMap.get("crawlerDocumentHtmlPrunedTags");
if (tags == null) {
tags = split(getCrawlerDocumentHtmlPrunedTags(), ",").get(stream -> stream.filter(StringUtil::isNotBlank).map(v -> {
- final String[] cssValues = v.split("\\.", 2);
- final String css;
- if (cssValues.length == 2) {
- css = cssValues[1];
- } else {
- css = null;
- }
-
- final String[] idValues = cssValues[0].split("#", 2);
- final String id;
- if (idValues.length == 2) {
- id = idValues[1];
- } else {
- id = null;
+ final Pattern pattern = Pattern.compile("(\\w+)(\\[[^\\]]+\\])?(\\.\\w+)?(#\\w+)?");
+ final Matcher matcher = pattern.matcher(v.trim());
+ if (matcher.matches()) {
+ final PrunedTag tag = new PrunedTag(matcher.group(1));
+ if (matcher.group(2) != null) {
+ final Matcher equalMatcher = Pattern.compile("\\[(\\w+)=([\"']?)(\\w+)\\2\\]").matcher(matcher.group(2)); // value may be bare or wrapped in one matching pair of quotes
+ if (!equalMatcher.matches()) { // never fall back to a tag-only selector: that would prune every element with this tag name
+ throw new FessSystemException("Invalid pruned tag: " + v);
+ }
+ tag.setAttr(equalMatcher.group(1), equalMatcher.group(3));
+ }
+ if (matcher.group(3) != null) {
+ tag.setCss(matcher.group(3).substring(1));
+ }
+ if (matcher.group(4) != null) {
+ tag.setId(matcher.group(4).substring(1));
+ }
+ return tag;
}
-
- return new PrunedTag(idValues[0], id, css);
+ throw new FessSystemException("Invalid pruned tag: " + v);
}).toArray(n -> new PrunedTag[n]));
propMap.put("crawlerDocumentHtmlPrunedTags", tags);
}
diff --git a/src/main/java/org/codelibs/fess/util/PrunedTag.java b/src/main/java/org/codelibs/fess/util/PrunedTag.java
index 9c1f224f3..b7692d6ac 100644
--- a/src/main/java/org/codelibs/fess/util/PrunedTag.java
+++ b/src/main/java/org/codelibs/fess/util/PrunedTag.java
@@ -15,24 +15,30 @@
*/
package org.codelibs.fess.util;
+import org.apache.commons.lang3.StringUtils;
import org.codelibs.core.lang.StringUtil;
import org.codelibs.core.stream.StreamUtil;
import org.w3c.dom.Node;
public class PrunedTag {
private final String tag;
- private final String id;
- private final String css;
+ private String id;
+ private String css;
+ private String attrName;
+ private String attrValue;
- public PrunedTag(final String tag, final String id, final String css) {
+ public PrunedTag(final String tag) {
this.tag = tag;
- this.id = id;
- this.css = css;
-
}
public boolean matches(final Node node) {
if (tag.equalsIgnoreCase(node.getNodeName())) {
+ if (attrName != null) {
+ Node attr = node.getAttributes().getNamedItem(attrName);
+ if (attr == null || !attrValue.equals(attr.getNodeValue())) {
+ return false;
+ }
+ }
if (id == null) {
if (css == null) {
return true;
@@ -56,11 +62,6 @@ public class PrunedTag {
return false;
}
- @Override
- public String toString() {
- return "PrunedTag [tag=" + tag + ", id=" + id + ", css=" + css + "]";
- }
-
@Override
public int hashCode() {
final int prime = 31;
@@ -83,27 +84,28 @@ public class PrunedTag {
return false;
}
final PrunedTag other = (PrunedTag) obj;
- if (css == null) {
- if (other.css != null) {
- return false;
- }
- } else if (!css.equals(other.css)) {
- return false;
- }
- if (id == null) {
- if (other.id != null) {
- return false;
- }
- } else if (!id.equals(other.id)) {
- return false;
- }
- if (tag == null) {
- if (other.tag != null) {
- return false;
- }
- } else if (!tag.equals(other.tag)) {
- return false;
- }
- return true;
+ return StringUtils.compare(tag, other.tag) == 0 //
+ && StringUtils.compare(css, other.css) == 0 //
+ && StringUtils.compare(id, other.id) == 0 //
+ && StringUtils.compare(attrName, other.attrName) == 0 //
+ && StringUtils.compare(attrValue, other.attrValue) == 0;
+ }
+
+ public void setId(final String id) { // null leaves the id unconstrained (matches() has an explicit id == null branch)
+ this.id = id;
+ }
+
+ public void setCss(final String css) { // null leaves the css constraint off (matches() checks css == null)
+ this.css = css;
+ }
+
+ public void setAttr(final String name, final String value) { // value must be non-null whenever name is non-null: matches() calls attrValue.equals(...)
+ this.attrName = name;
+ this.attrValue = value;
+ }
+
+ @Override
+ public String toString() { // renders every selector field; unset fields print as "null"
+ return new StringBuilder("PrunedTag [tag=").append(tag).append(", id=").append(id).append(", css=").append(css).append(", attrName=").append(attrName).append(", attrValue=").append(attrValue).append("]").toString();
+ }
}
diff --git a/src/main/resources/fess_config.properties b/src/main/resources/fess_config.properties
index 9e6e3e028..ddf89ccc9 100644
--- a/src/main/resources/fess_config.properties
+++ b/src/main/resources/fess_config.properties
@@ -119,7 +119,7 @@ crawler.document.html.content.xpath=//BODY
crawler.document.html.lang.xpath=//HTML/@lang
crawler.document.html.digest.xpath=//META[@name='description']/@content
crawler.document.html.canonical.xpath=//LINK[@rel='canonical']/@href
-crawler.document.html.pruned.tags=noscript,script,style,header,footer,nav
+crawler.document.html.pruned.tags=noscript,script,style,header,footer,nav,a[rel=nofollow]
crawler.document.html.max.digest.length=200
# file
diff --git a/src/test/java/org/codelibs/fess/mylasta/direction/FessPropTest.java b/src/test/java/org/codelibs/fess/mylasta/direction/FessPropTest.java
index bfd3d4b08..0cc4d250a 100644
--- a/src/test/java/org/codelibs/fess/mylasta/direction/FessPropTest.java
+++ b/src/test/java/org/codelibs/fess/mylasta/direction/FessPropTest.java
@@ -15,6 +15,7 @@
*/
package org.codelibs.fess.mylasta.direction;
+import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.IOException;
import java.util.Arrays;
@@ -23,7 +24,12 @@ import java.util.HashMap;
import org.codelibs.core.io.FileUtil;
import org.codelibs.core.misc.DynamicProperties;
import org.codelibs.fess.unit.UnitFessTestCase;
+import org.codelibs.fess.util.PrunedTag;
+import org.cyberneko.html.parsers.DOMParser;
import org.lastaflute.di.core.factory.SingletonLaContainerFactory;
+import org.w3c.dom.Document;
+import org.w3c.dom.Node;
+import org.xml.sax.InputSource;
public class FessPropTest extends UnitFessTestCase {
@@ -120,6 +126,39 @@ public class FessPropTest extends UnitFessTestCase {
assertEquals(12288, spaceChars[1]);
}
+ public void test_getCrawlerDocumentHtmlPrunedTagsAsArray() throws Exception {
+ FessProp.propMap.clear(); // drop any cached PrunedTag[] so the overridden config below is parsed
+ FessConfig fessConfig = new FessConfig.SimpleImpl() {
+ @Override
+ public String getCrawlerDocumentHtmlPrunedTags() {
+ return "script,div#main,p.image,a[rel=nofollow]";
+ }
+ };
+
+ PrunedTag[] tags = fessConfig.getCrawlerDocumentHtmlPrunedTagsAsArray();
+ assertTrue(matchesTag(tags[0], "<script></script>")); // tag-only selector
+ assertTrue(matchesTag(tags[0], "<script id=\"main\"></script>")); // extra attributes do not prevent a tag-only match
+ assertFalse(matchesTag(tags[0], "<a></a>"));
+
+ assertTrue(matchesTag(tags[1], "<div id=\"main\"></div>")); // tag#id selector
+ assertFalse(matchesTag(tags[1], "<div></div>"));
+
+ assertTrue(matchesTag(tags[2], "<p class=\"image\"></p>")); // tag.class selector
+ assertFalse(matchesTag(tags[2], "<p></p>"));
+
+ assertTrue(matchesTag(tags[3], "<a rel=\"nofollow\"></a>")); // tag[attr=value] selector
+ assertFalse(matchesTag(tags[3], "<a></a>"));
+ }
+
+ private boolean matchesTag(final PrunedTag tag, final String text) throws Exception {
+ final byte[] page = ("<html><body>" + text + "</body></html>").getBytes("UTF-8"); // wrap the fragment in a minimal page
+ final DOMParser htmlParser = new DOMParser();
+ htmlParser.parse(new InputSource(new ByteArrayInputStream(page)));
+ final Document document = htmlParser.getDocument();
+ final Node firstBodyChild = document.getFirstChild().getLastChild().getFirstChild(); // document -> first child (expected <html>) -> last child (expected <body>) -> its first child
+ return tag.matches(firstBodyChild);
+ }
+
public void test_normalizeQueryLanguages() {
FessProp.propMap.clear();
FessConfig fessConfig = new FessConfig.SimpleImpl() {
--
GitLab