diff --git a/src/main/java/org/codelibs/fess/app/web/base/FessSearchAction.java b/src/main/java/org/codelibs/fess/app/web/base/FessSearchAction.java index b5414cd39c424b7ce7ca6b7064eb2c9870904b93..dad7d9172da3517c1559cfd945cc6fbf601edab7 100644 --- a/src/main/java/org/codelibs/fess/app/web/base/FessSearchAction.java +++ b/src/main/java/org/codelibs/fess/app/web/base/FessSearchAction.java @@ -15,10 +15,10 @@ */ package org.codelibs.fess.app.web.base; +import java.util.ArrayList; import java.util.HashSet; import java.util.LinkedHashMap; import java.util.List; -import java.util.ArrayList; import java.util.Locale; import java.util.Map; import java.util.Set; diff --git a/src/main/java/org/codelibs/fess/helper/LabelTypeHelper.java b/src/main/java/org/codelibs/fess/helper/LabelTypeHelper.java index 3b3c015bcad45bd46f472ddb1045ece644460837..83a801fc4aaa965b3a9810428fe52852e1780800 100644 --- a/src/main/java/org/codelibs/fess/helper/LabelTypeHelper.java +++ b/src/main/java/org/codelibs/fess/helper/LabelTypeHelper.java @@ -35,7 +35,6 @@ import org.codelibs.fess.Constants; import org.codelibs.fess.app.service.LabelTypeService; import org.codelibs.fess.entity.SearchRequestParams.SearchRequestType; import org.codelibs.fess.es.config.exentity.LabelType; -import org.codelibs.fess.mylasta.direction.FessConfig; import org.codelibs.fess.util.ComponentUtil; import org.slf4j.Logger; import org.slf4j.LoggerFactory; diff --git a/src/main/java/org/codelibs/fess/mylasta/direction/FessProp.java b/src/main/java/org/codelibs/fess/mylasta/direction/FessProp.java index 3c1aaa78459072c10ff3a76c971db095d55a9241..70d69bed259b59bda2cfebed8a597f20b2d775d3 100644 --- a/src/main/java/org/codelibs/fess/mylasta/direction/FessProp.java +++ b/src/main/java/org/codelibs/fess/mylasta/direction/FessProp.java @@ -35,6 +35,7 @@ import java.util.Map; import java.util.Set; import java.util.concurrent.ConcurrentHashMap; import java.util.function.Function; +import java.util.regex.Matcher; import java.util.regex.Pattern; import java.util.stream.Collectors; @@ -47,6 +48,7 @@ import org.codelibs.core.lang.StringUtil; import org.codelibs.core.misc.Pair; import org.codelibs.core.misc.Tuple3; import org.codelibs.fess.Constants; +import org.codelibs.fess.exception.FessSystemException; import org.codelibs.fess.helper.PermissionHelper; import org.codelibs.fess.mylasta.action.FessUserBean; import org.codelibs.fess.taglib.FessFunctions; @@ -635,23 +637,26 @@ public interface FessProp { PrunedTag[] tags = (PrunedTag[]) propMap.get("crawlerDocumentHtmlPrunedTags"); if (tags == null) { tags = split(getCrawlerDocumentHtmlPrunedTags(), ",").get(stream -> stream.filter(StringUtil::isNotBlank).map(v -> { - final String[] cssValues = v.split("\\.", 2); - final String css; - if (cssValues.length == 2) { - css = cssValues[1]; - } else { - css = null; - } - - final String[] idValues = cssValues[0].split("#", 2); - final String id; - if (idValues.length == 2) { - id = idValues[1]; - } else { - id = null; + final Pattern pattern = Pattern.compile("(\\w+)(\\[[^\\]]+\\])?(\\.\\w+)?(#\\w+)?"); + final Matcher matcher = pattern.matcher(v.trim()); + if (matcher.matches()) { + final PrunedTag tag = new PrunedTag(matcher.group(1)); + if (matcher.group(2) != null) { + final String attrPair = matcher.group(2).substring(1, matcher.group(2).length() - 1); + final Matcher equalMatcher = Pattern.compile("(\\w+)=(\\w+)").matcher(attrPair); + if (equalMatcher.matches()) { + tag.setAttr(equalMatcher.group(1), equalMatcher.group(2)); + } + } + if (matcher.group(3) != null) { + tag.setCss(matcher.group(3).substring(1)); + } + if (matcher.group(4) != null) { + tag.setId(matcher.group(4).substring(1)); + } + return tag; } - - return new PrunedTag(idValues[0], id, css); + throw new FessSystemException("Invalid pruned tag: " + v); }).toArray(n -> new PrunedTag[n])); propMap.put("crawlerDocumentHtmlPrunedTags", tags); } diff --git a/src/main/java/org/codelibs/fess/util/PrunedTag.java b/src/main/java/org/codelibs/fess/util/PrunedTag.java index 9c1f224f369e10b42f84a5725b2dd35b7f78dc35..b7692d6acc71d40689fb5fab1b47fc76e6feda70 100644 --- a/src/main/java/org/codelibs/fess/util/PrunedTag.java +++ b/src/main/java/org/codelibs/fess/util/PrunedTag.java @@ -15,24 +15,30 @@ */ package org.codelibs.fess.util; +import org.apache.commons.lang3.StringUtils; import org.codelibs.core.lang.StringUtil; import org.codelibs.core.stream.StreamUtil; import org.w3c.dom.Node; public class PrunedTag { private final String tag; - private final String id; - private final String css; + private String id; + private String css; + private String attrName; + private String attrValue; - public PrunedTag(final String tag, final String id, final String css) { + public PrunedTag(final String tag) { this.tag = tag; - this.id = id; - this.css = css; - } public boolean matches(final Node node) { if (tag.equalsIgnoreCase(node.getNodeName())) { + if (attrName != null) { + Node attr = node.getAttributes().getNamedItem(attrName); + if (attr == null || !attrValue.equals(attr.getNodeValue())) { + return false; + } + } if (id == null) { if (css == null) { return true; @@ -56,11 +62,6 @@ public class PrunedTag { return false; } - @Override - public String toString() { - return "PrunedTag [tag=" + tag + ", id=" + id + ", css=" + css + "]"; - } - @Override public int hashCode() { final int prime = 31; @@ -83,27 +84,28 @@ public class PrunedTag { return false; } final PrunedTag other = (PrunedTag) obj; - if (css == null) { - if (other.css != null) { - return false; - } - } else if (!css.equals(other.css)) { - return false; - } - if (id == null) { - if (other.id != null) { - return false; - } - } else if (!id.equals(other.id)) { - return false; - } - if (tag == null) { - if (other.tag != null) { - return false; - } - } else if (!tag.equals(other.tag)) { - return false; - } - return true; + return StringUtils.compare(tag, other.tag) == 0 // + && StringUtils.compare(css, other.css) == 0 // + && StringUtils.compare(id, other.id) == 0 // + && StringUtils.compare(attrName, other.attrName) == 0 // + && StringUtils.compare(attrValue, other.attrValue) == 0; + } + + public void setId(String id) { + this.id = id; + } + + public void setCss(String css) { + this.css = css; + } + + public void setAttr(String name, String value) { + this.attrName = name; + this.attrValue = value; + } + + @Override + public String toString() { + return "PrunedTag [tag=" + tag + ", id=" + id + ", css=" + css + ", attrName=" + attrName + ", attrValue=" + attrValue + "]"; } } diff --git a/src/main/resources/fess_config.properties b/src/main/resources/fess_config.properties index 9e6e3e02894a89140297c9f39be18ff0c41e7685..ddf89ccc99afacf1b5dab268b254b151f1a21c53 100644 --- a/src/main/resources/fess_config.properties +++ b/src/main/resources/fess_config.properties @@ -119,7 +119,7 @@ crawler.document.html.content.xpath=//BODY crawler.document.html.lang.xpath=//HTML/@lang crawler.document.html.digest.xpath=//META[@name='description']/@content crawler.document.html.canonical.xpath=//LINK[@rel='canonical']/@href -crawler.document.html.pruned.tags=noscript,script,style,header,footer,nav +crawler.document.html.pruned.tags=noscript,script,style,header,footer,nav,a[rel="nofollow"] crawler.document.html.max.digest.length=200 # file diff --git a/src/test/java/org/codelibs/fess/mylasta/direction/FessPropTest.java b/src/test/java/org/codelibs/fess/mylasta/direction/FessPropTest.java index bfd3d4b08fc87de3ff732ca3cc5d70f603f544d5..0cc4d250a59c2c49c4874638cce11ed5dffaa23c 100644 --- a/src/test/java/org/codelibs/fess/mylasta/direction/FessPropTest.java +++ b/src/test/java/org/codelibs/fess/mylasta/direction/FessPropTest.java @@ -15,6 +15,7 @@ */ package org.codelibs.fess.mylasta.direction; +import java.io.ByteArrayInputStream; import java.io.File; import java.io.IOException; import java.util.Arrays; @@ -23,7 +24,12 @@ import java.util.HashMap; import org.codelibs.core.io.FileUtil; import org.codelibs.core.misc.DynamicProperties; import org.codelibs.fess.unit.UnitFessTestCase; +import org.codelibs.fess.util.PrunedTag; +import org.cyberneko.html.parsers.DOMParser; import org.lastaflute.di.core.factory.SingletonLaContainerFactory; +import org.w3c.dom.Document; +import org.w3c.dom.Node; +import org.xml.sax.InputSource; public class FessPropTest extends UnitFessTestCase { @@ -120,6 +126,39 @@ public class FessPropTest extends UnitFessTestCase { assertEquals(12288, spaceChars[1]); } + public void test_getCrawlerDocumentHtmlPrunedTagsAsArray() throws Exception { + FessProp.propMap.clear(); + FessConfig fessConfig = new FessConfig.SimpleImpl() { + @Override + public String getCrawlerDocumentHtmlPrunedTags() { + return "script,div#main,p.image,a[rel=nofollow]"; + } + }; + + PrunedTag[] tags = fessConfig.getCrawlerDocumentHtmlPrunedTagsAsArray(); + assertTrue(matchesTag(tags[0], "<script></script>")); + assertTrue(matchesTag(tags[0], "<script id=\\\"main\\\"></script>")); + assertFalse(matchesTag(tags[0], "<a></a>")); + + assertTrue(matchesTag(tags[1], "<div id=\"main\"></div>")); + assertFalse(matchesTag(tags[1], "<div></div>")); + + assertTrue(matchesTag(tags[2], "<p class=\"image\"></p>")); + assertFalse(matchesTag(tags[2], "<p></p>")); + + assertTrue(matchesTag(tags[3], "<a rel=\"nofollow\"></a>")); + assertFalse(matchesTag(tags[3], "<a></a>")); + } + + private boolean matchesTag(final PrunedTag tag, final String text) throws Exception { + final DOMParser parser = new DOMParser(); + final String html = "<html><body>" + text + "</body></html>"; + final ByteArrayInputStream is = new ByteArrayInputStream(html.getBytes("UTF-8")); + parser.parse(new InputSource(is)); + Node node = parser.getDocument().getFirstChild().getLastChild().getFirstChild(); + return tag.matches(node); + } + public void test_normalizeQueryLanguages() { FessProp.propMap.clear(); FessConfig fessConfig = new FessConfig.SimpleImpl() {