Skip to content
Snippets Groups Projects

Compare revisions

Changes are shown as if the source revision was being merged into the target revision. Learn more about comparing revisions.

Source

Select target project
No results found

Target

Select target project
  • jonasled/fess-search-engine
1 result
Show changes
Commits on Source (2)
......@@ -216,42 +216,6 @@ public class FessXpathTransformer extends XpathTransformer implements FessTransf
return;
}
// meta tag
try {
final Node value = getXPathAPI().selectSingleNode(document, META_NAME_ROBOTS_CONTENT);
if (value != null) {
boolean noindex = false;
boolean nofollow = false;
final String content = value.getTextContent().toLowerCase(Locale.ROOT);
if (content.contains(ROBOTS_TAG_NONE)) {
noindex = true;
nofollow = true;
} else {
if (content.contains(ROBOTS_TAG_NOINDEX)) {
noindex = true;
}
if (content.contains(ROBOTS_TAG_NOFOLLOW)) {
nofollow = true;
}
}
if (noindex && nofollow) {
logger.info("META(robots=noindex,nofollow): {}", responseData.getUrl());
throw new ChildUrlsException(Collections.emptySet(), "#processMetaRobots");
}
if (noindex) {
logger.info("META(robots=noindex): {}", responseData.getUrl());
storeChildUrls(responseData, resultData);
throw new ChildUrlsException(resultData.getChildUrlSet(), "#processMetaRobots");
}
if (nofollow) {
logger.info("META(robots=nofollow): {}", responseData.getUrl());
responseData.setNoFollow(true);
}
}
} catch (final TransformerException e) {
logger.warn("Could not parse a value of {}", META_NAME_ROBOTS_CONTENT, e);
}
}
protected void processXRobotsTag(final ResponseData responseData, final ResultData resultData) {
......
......@@ -273,261 +273,7 @@ public class FessXpathTransformerTest extends UnitFessTestCase {
final String output = getXmlString(pruneNode).replaceAll(".*<BODY>", "").replaceAll("</BODY>.*", "");
assertEquals("foo1<!--googleoff: index--><A href=\"index.html\"></A><!--googleon: index-->foo5", output);
}
/**
 * Calls {@code processXRobotsTag} with a response that has a URL but no
 * {@code X-Robots-Tag} metadata and checks that the response is not marked no-follow.
 */
public void test_processXRobotsTags_no() throws Exception {
    // Response under test: only the URL is set, no metadata is added.
    final ResponseData response = new ResponseData();
    response.setUrl("http://example.com/");

    // Transformer whose per-config parameter lookup always yields an empty map.
    final FessXpathTransformer target = new FessXpathTransformer() {
        @Override
        protected Map<String, String> getConfigPrameterMap(final ResponseData data, final ConfigName name) {
            return Collections.emptyMap();
        }
    };

    // Config stub that reports isCrawlerIgnoreRobotsTags() as false.
    target.fessConfig = new FessConfig.SimpleImpl() {
        private static final long serialVersionUID = 1L;

        @Override
        public boolean isCrawlerIgnoreRobotsTags() {
            return false;
        }
    };

    final ResultData result = new ResultData();
    target.processXRobotsTag(response, result);

    assertFalse(response.isNoFollow());
}
/**
 * Calls {@code processXRobotsTag} with {@code X-Robots-Tag} metadata set to
 * {@code noindex,nofollow} and expects a {@link ChildUrlsException} whose child URL list is empty.
 *
 * @throws Exception any unexpected exception; it is propagated so the test runner reports
 *         it together with its stack trace
 */
public void test_processXRobotsTag_noindexnofollow() throws Exception {
    final FessXpathTransformer transformer = new FessXpathTransformer() {
        @Override
        protected Map<String, String> getConfigPrameterMap(final ResponseData responseData, final ConfigName config) {
            return Collections.emptyMap();
        }
    };
    // Config stub that reports isCrawlerIgnoreRobotsTags() as false.
    transformer.fessConfig = new FessConfig.SimpleImpl() {
        private static final long serialVersionUID = 1L;

        @Override
        public boolean isCrawlerIgnoreRobotsTags() {
            return false;
        }
    };

    final ResponseData responseData = new ResponseData();
    responseData.setUrl("http://example.com/");
    responseData.addMetaData("X-Robots-Tag", "noindex,nofollow");

    try {
        transformer.processXRobotsTag(responseData, new ResultData());
        // Reaching this line means the expected exception was not thrown.
        fail();
    } catch (final ChildUrlsException e) {
        assertTrue(e.getChildUrlList().isEmpty());
    }
    // No catch-all here: an unexpected exception should surface with its cause
    // instead of being replaced by a message-less fail().
}
/**
 * Calls {@code processXRobotsTag} with {@code X-Robots-Tag} metadata set to {@code noindex}
 * on a response whose body contains a robots meta tag and one anchor, and expects a
 * {@link ChildUrlsException} whose child URL list is empty.
 *
 * @throws Exception any unexpected exception; it is propagated so the test runner reports
 *         it together with its stack trace
 */
public void test_processXRobotsTag_noindex() throws Exception {
    final String data = "<meta name=\"robots\" content=\"noindex\" /><a href=\"index.html\">aaa</a>";
    final FessXpathTransformer transformer = new FessXpathTransformer() {
        @Override
        protected Map<String, String> getConfigPrameterMap(final ResponseData responseData, final ConfigName config) {
            return Collections.emptyMap();
        }
    };
    // Config stub that reports isCrawlerIgnoreRobotsTags() as false.
    transformer.fessConfig = new FessConfig.SimpleImpl() {
        private static final long serialVersionUID = 1L;

        @Override
        public boolean isCrawlerIgnoreRobotsTags() {
            return false;
        }
    };

    final ResponseData responseData = new ResponseData();
    responseData.setUrl("http://example.com/");
    // Explicit charset so the fixture bytes do not depend on the platform default encoding.
    responseData.setResponseBody(data.getBytes(java.nio.charset.StandardCharsets.UTF_8));
    responseData.addMetaData("X-Robots-Tag", "noindex");

    try {
        transformer.processXRobotsTag(responseData, new ResultData());
        // Reaching this line means the expected exception was not thrown.
        fail();
    } catch (final ChildUrlsException e) {
        assertTrue(e.getChildUrlList().isEmpty());
    }
    // No catch-all here: an unexpected exception should surface with its cause
    // instead of being replaced by a message-less fail().
}
/**
 * Calls {@code processXRobotsTag} with {@code X-Robots-Tag} metadata set to {@code nofollow}
 * and checks that the response ends up marked no-follow.
 */
public void test_processXRobotsTag_nofollow() throws Exception {
    final FessXpathTransformer transformer = new FessXpathTransformer() {
        // @Override makes the compiler verify that this (unusually spelled) hook
        // still matches the superclass method, as the first test in this group does.
        @Override
        protected Map<String, String> getConfigPrameterMap(final ResponseData responseData, final ConfigName config) {
            return Collections.emptyMap();
        }
    };
    // Config stub that reports isCrawlerIgnoreRobotsTags() as false.
    transformer.fessConfig = new FessConfig.SimpleImpl() {
        private static final long serialVersionUID = 1L;

        @Override
        public boolean isCrawlerIgnoreRobotsTags() {
            return false;
        }
    };

    final ResponseData responseData = new ResponseData();
    responseData.addMetaData("X-Robots-Tag", "nofollow");

    transformer.processXRobotsTag(responseData, new ResultData());

    assertTrue(responseData.isNoFollow());
}
/**
 * Calls {@code processMetaRobots} with a document parsed from markup that contains no
 * robots meta tag and checks that the response is not marked no-follow.
 */
public void test_processMetaRobots_no() throws Exception {
    // Markup without any <meta name="robots"> element.
    final Document parsed = getDocument("<html><body>foo</body></html>");

    // Transformer whose per-config parameter lookup always yields an empty map.
    final FessXpathTransformer target = new FessXpathTransformer() {
        @Override
        protected Map<String, String> getConfigPrameterMap(final ResponseData data, final ConfigName name) {
            return Collections.emptyMap();
        }
    };

    // Config stub that reports isCrawlerIgnoreRobotsTags() as false.
    target.fessConfig = new FessConfig.SimpleImpl() {
        private static final long serialVersionUID = 1L;

        @Override
        public boolean isCrawlerIgnoreRobotsTags() {
            return false;
        }
    };

    final ResponseData response = new ResponseData();
    response.setUrl("http://example.com/");

    final ResultData result = new ResultData();
    target.processMetaRobots(response, result, parsed);

    assertFalse(response.isNoFollow());
}
/**
 * Calls {@code processMetaRobots} with a document whose robots meta tag has content
 * {@code none} and expects a {@link ChildUrlsException} whose child URL list is empty.
 *
 * @throws Exception any unexpected exception; it is propagated so the test runner reports
 *         it together with its stack trace
 */
public void test_processMetaRobots_none() throws Exception {
    final String data = "<meta name=\"robots\" content=\"none\" />";
    final Document document = getDocument(data);
    final FessXpathTransformer transformer = new FessXpathTransformer() {
        @Override
        protected Map<String, String> getConfigPrameterMap(final ResponseData responseData, final ConfigName config) {
            return Collections.emptyMap();
        }
    };
    // Config stub that reports isCrawlerIgnoreRobotsTags() as false.
    transformer.fessConfig = new FessConfig.SimpleImpl() {
        private static final long serialVersionUID = 1L;

        @Override
        public boolean isCrawlerIgnoreRobotsTags() {
            return false;
        }
    };

    final ResponseData responseData = new ResponseData();
    responseData.setUrl("http://example.com/");

    try {
        transformer.processMetaRobots(responseData, new ResultData(), document);
        // Reaching this line means the expected exception was not thrown.
        fail();
    } catch (final ChildUrlsException e) {
        assertTrue(e.getChildUrlList().isEmpty());
    }
    // No catch-all here: an unexpected exception should surface with its cause
    // instead of being replaced by a message-less fail().
}
/**
 * Calls {@code processMetaRobots} with a document whose robots meta tag is written in
 * upper case ({@code name="ROBOTS"}, {@code content="NOINDEX,NOFOLLOW"}) and expects a
 * {@link ChildUrlsException} whose child URL list is empty.
 *
 * @throws Exception any unexpected exception; it is propagated so the test runner reports
 *         it together with its stack trace
 */
public void test_processMetaRobots_noindexnofollow() throws Exception {
    final String data = "<meta name=\"ROBOTS\" content=\"NOINDEX,NOFOLLOW\" />";
    final Document document = getDocument(data);
    final FessXpathTransformer transformer = new FessXpathTransformer() {
        @Override
        protected Map<String, String> getConfigPrameterMap(final ResponseData responseData, final ConfigName config) {
            return Collections.emptyMap();
        }
    };
    // Config stub that reports isCrawlerIgnoreRobotsTags() as false.
    transformer.fessConfig = new FessConfig.SimpleImpl() {
        private static final long serialVersionUID = 1L;

        @Override
        public boolean isCrawlerIgnoreRobotsTags() {
            return false;
        }
    };

    final ResponseData responseData = new ResponseData();
    responseData.setUrl("http://example.com/");

    try {
        transformer.processMetaRobots(responseData, new ResultData(), document);
        // Reaching this line means the expected exception was not thrown.
        fail();
    } catch (final ChildUrlsException e) {
        assertTrue(e.getChildUrlList().isEmpty());
    }
    // No catch-all here: an unexpected exception should surface with its cause
    // instead of being replaced by a message-less fail().
}
/**
 * Calls {@code processMetaRobots} with a document containing a robots meta tag with
 * content {@code noindex} plus one anchor, and expects a {@link ChildUrlsException}
 * whose child URL list is empty.
 *
 * @throws Exception any unexpected exception; it is propagated so the test runner reports
 *         it together with its stack trace
 */
public void test_processMetaRobots_noindex() throws Exception {
    final String data = "<meta name=\"robots\" content=\"noindex\" /><a href=\"index.html\">aaa</a>";
    final Document document = getDocument(data);
    final FessXpathTransformer transformer = new FessXpathTransformer() {
        @Override
        protected Map<String, String> getConfigPrameterMap(final ResponseData responseData, final ConfigName config) {
            return Collections.emptyMap();
        }

        // NOTE(review): presumably overrides a hook on FessXpathTransformer; add @Override
        // once the superclass declaration is confirmed.
        protected PathMappingHelper getPathMappingHelper() {
            return new PathMappingHelper();
        }
    };
    // Config stub that reports isCrawlerIgnoreRobotsTags() as false.
    transformer.fessConfig = new FessConfig.SimpleImpl() {
        private static final long serialVersionUID = 1L;

        @Override
        public boolean isCrawlerIgnoreRobotsTags() {
            return false;
        }
    };

    final ResponseData responseData = new ResponseData();
    responseData.setUrl("http://example.com/");
    // Explicit charset so the fixture bytes do not depend on the platform default encoding.
    responseData.setResponseBody(data.getBytes(java.nio.charset.StandardCharsets.UTF_8));

    try {
        transformer.processMetaRobots(responseData, new ResultData(), document);
        // Reaching this line means the expected exception was not thrown.
        fail();
    } catch (final ChildUrlsException e) {
        assertTrue(e.getChildUrlList().isEmpty());
    }
    // No catch-all here: an unexpected exception should surface with its cause
    // instead of being replaced by a message-less fail().
}
/**
 * Calls {@code processMetaRobots} with a document whose robots meta tag has content
 * {@code nofollow} and checks that the response ends up marked no-follow.
 */
public void test_processMetaRobots_nofollow() throws Exception {
    final String data = "<meta name=\"robots\" content=\"nofollow\" />";
    final Document document = getDocument(data);
    final FessXpathTransformer transformer = new FessXpathTransformer() {
        // @Override makes the compiler verify that this (unusually spelled) hook
        // still matches the superclass method, as other tests in this group do.
        @Override
        protected Map<String, String> getConfigPrameterMap(final ResponseData responseData, final ConfigName config) {
            return Collections.emptyMap();
        }
    };
    // Config stub that reports isCrawlerIgnoreRobotsTags() as false.
    transformer.fessConfig = new FessConfig.SimpleImpl() {
        private static final long serialVersionUID = 1L;

        @Override
        public boolean isCrawlerIgnoreRobotsTags() {
            return false;
        }
    };

    final ResponseData responseData = new ResponseData();
    responseData.setUrl("http://example.com/");

    transformer.processMetaRobots(responseData, new ResultData(), document);

    assertTrue(responseData.isNoFollow());
}
private Document getDocument(final String data) throws Exception {
final DOMParser parser = new DOMParser();
......