From cda8467f9568ff5f532d155a37868c7ed0435654 Mon Sep 17 00:00:00 2001 From: Joe Zhou Date: Thu, 24 Nov 2022 00:49:43 +0800 Subject: [PATCH 01/13] Update for next development version --- pom.xml | 2 +- webmagic-core/pom.xml | 2 +- webmagic-coverage/pom.xml | 2 +- webmagic-extension/pom.xml | 2 +- webmagic-samples/pom.xml | 2 +- webmagic-saxon/pom.xml | 2 +- webmagic-scripts/pom.xml | 2 +- webmagic-selenium/pom.xml | 2 +- 8 files changed, 8 insertions(+), 8 deletions(-) diff --git a/pom.xml b/pom.xml index 68bf76d9c..cdf618211 100644 --- a/pom.xml +++ b/pom.xml @@ -1,7 +1,7 @@ us.codecraft - 0.8.0 + 0.8.1-SNAPSHOT 4.0.0 pom diff --git a/webmagic-core/pom.xml b/webmagic-core/pom.xml index 997eb812c..7fe2ba6ff 100644 --- a/webmagic-core/pom.xml +++ b/webmagic-core/pom.xml @@ -3,7 +3,7 @@ us.codecraft webmagic-parent - 0.8.0 + 0.8.1-SNAPSHOT 4.0.0 diff --git a/webmagic-coverage/pom.xml b/webmagic-coverage/pom.xml index e2c0f741c..c6b70bce1 100644 --- a/webmagic-coverage/pom.xml +++ b/webmagic-coverage/pom.xml @@ -8,7 +8,7 @@ us.codecraft webmagic-parent - 0.8.0 + 0.8.1-SNAPSHOT webmagic-coverage diff --git a/webmagic-extension/pom.xml b/webmagic-extension/pom.xml index 05d6100a6..daf0c7fdc 100644 --- a/webmagic-extension/pom.xml +++ b/webmagic-extension/pom.xml @@ -3,7 +3,7 @@ us.codecraft webmagic-parent - 0.8.0 + 0.8.1-SNAPSHOT 4.0.0 diff --git a/webmagic-samples/pom.xml b/webmagic-samples/pom.xml index 449fcf243..e015567c2 100644 --- a/webmagic-samples/pom.xml +++ b/webmagic-samples/pom.xml @@ -3,7 +3,7 @@ webmagic-parent us.codecraft - 0.8.0 + 0.8.1-SNAPSHOT 4.0.0 diff --git a/webmagic-saxon/pom.xml b/webmagic-saxon/pom.xml index b73f6fd27..732c23bd0 100644 --- a/webmagic-saxon/pom.xml +++ b/webmagic-saxon/pom.xml @@ -3,7 +3,7 @@ webmagic-parent us.codecraft - 0.8.0 + 0.8.1-SNAPSHOT 4.0.0 diff --git a/webmagic-scripts/pom.xml b/webmagic-scripts/pom.xml index 3ec15f9af..d1225dda2 100644 --- a/webmagic-scripts/pom.xml +++ b/webmagic-scripts/pom.xml @@ -3,7 +3,7 @@ 
webmagic-parent us.codecraft - 0.8.0 + 0.8.1-SNAPSHOT 4.0.0 diff --git a/webmagic-selenium/pom.xml b/webmagic-selenium/pom.xml index 715d7731b..a430772b6 100644 --- a/webmagic-selenium/pom.xml +++ b/webmagic-selenium/pom.xml @@ -3,7 +3,7 @@ webmagic-parent us.codecraft - 0.8.0 + 0.8.1-SNAPSHOT 4.0.0 From faf7e1559aa98a3bae6421fab1396257324e7273 Mon Sep 17 00:00:00 2001 From: Joe Zhou Date: Thu, 24 Nov 2022 20:31:43 +0800 Subject: [PATCH 02/13] Update README for the webmagic version. --- README-zh.md | 4 ++-- README.md | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/README-zh.md b/README-zh.md index 62b3c9a5e..c3c4b72ea 100644 --- a/README-zh.md +++ b/README-zh.md @@ -39,12 +39,12 @@ webmagic使用maven管理依赖,在项目中添加对应的依赖即可使用w us.codecraft webmagic-core - 0.7.5 + ${webmagic.version} us.codecraft webmagic-extension - 0.7.5 + ${webmagic.version} ``` diff --git a/README.md b/README.md index 14aeac7b1..750a76841 100644 --- a/README.md +++ b/README.md @@ -25,12 +25,12 @@ Add dependencies to your pom.xml: us.codecraft webmagic-core - 0.7.5 + ${webmagic.version} us.codecraft webmagic-extension - 0.7.5 + ${webmagic.version} ``` From ef616c999e18bb9a7a351049749b3796d3abb977 Mon Sep 17 00:00:00 2001 From: Joe Zhou Date: Sun, 27 Nov 2022 02:05:31 +0800 Subject: [PATCH 03/13] Fix warnings. 
--- .../webmagic/monitor/SpiderMonitor.java | 37 +++++++++---------- 1 file changed, 18 insertions(+), 19 deletions(-) diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/monitor/SpiderMonitor.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/monitor/SpiderMonitor.java index b213dda94..50dbcaf1a 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/monitor/SpiderMonitor.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/monitor/SpiderMonitor.java @@ -1,21 +1,25 @@ package us.codecraft.webmagic.monitor; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; -import us.codecraft.webmagic.Request; -import us.codecraft.webmagic.Spider; -import us.codecraft.webmagic.SpiderListener; -import us.codecraft.webmagic.utils.Experimental; -import us.codecraft.webmagic.utils.UrlUtils; - -import javax.management.*; import java.lang.management.ManagementFactory; import java.util.ArrayList; import java.util.Collections; import java.util.List; -import java.util.concurrent.atomic.AtomicBoolean; import java.util.concurrent.atomic.AtomicInteger; +import javax.management.InstanceAlreadyExistsException; +import javax.management.JMException; +import javax.management.MBeanRegistrationException; +import javax.management.MBeanServer; +import javax.management.MalformedObjectNameException; +import javax.management.NotCompliantMBeanException; +import javax.management.ObjectName; + +import us.codecraft.webmagic.Request; +import us.codecraft.webmagic.Spider; +import us.codecraft.webmagic.SpiderListener; +import us.codecraft.webmagic.utils.Experimental; +import us.codecraft.webmagic.utils.UrlUtils; + /** * @author code4crafer@gmail.com * @since 0.5.0 @@ -23,17 +27,13 @@ @Experimental public class SpiderMonitor { - private static SpiderMonitor INSTANCE = new SpiderMonitor(); - - private AtomicBoolean started = new AtomicBoolean(false); - - private Logger logger = LoggerFactory.getLogger(getClass()); + private static final SpiderMonitor 
INSTANCE = new SpiderMonitor(); private MBeanServer mbeanServer; private String jmxServerName; - private List spiderStatuses = new ArrayList(); + private List spiderStatuses = new ArrayList<>(); protected SpiderMonitor() { jmxServerName = "WebMagic"; @@ -51,7 +51,7 @@ public synchronized SpiderMonitor register(Spider... spiders) throws JMException for (Spider spider : spiders) { MonitorSpiderListener monitorSpiderListener = new MonitorSpiderListener(); if (spider.getSpiderListeners() == null) { - List spiderListeners = new ArrayList(); + List spiderListeners = new ArrayList<>(); spiderListeners.add(monitorSpiderListener); spider.setSpiderListeners(spiderListeners); } else { @@ -90,7 +90,7 @@ public void onSuccess(Request request) { } @Override - public void onError(Request request) { + public void onError(Request request, Exception e) { errorUrls.add(request.getUrl()); errorCount.incrementAndGet(); } @@ -109,7 +109,6 @@ public List getErrorUrls() { } protected void registerMBean(SpiderStatusMXBean spiderStatus) throws MalformedObjectNameException, InstanceAlreadyExistsException, MBeanRegistrationException, NotCompliantMBeanException { -// ObjectName objName = new ObjectName(jmxServerName + ":name=" + spiderStatus.getName()); ObjectName objName = new ObjectName(jmxServerName + ":name=" + UrlUtils.removePort(spiderStatus.getName())); mbeanServer.registerMBean(spiderStatus, objName); } From 80424b0bd7242ae3f92055baabcedbf6e4a5913b Mon Sep 17 00:00:00 2001 From: Joe Zhou Date: Mon, 5 Dec 2022 23:26:01 +0800 Subject: [PATCH 04/13] Replace List with Iterable, fixed #1099. 
--- .../src/main/java/us/codecraft/webmagic/Page.java | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java index c11df693c..6370171df 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java @@ -20,7 +20,7 @@ * {@link #getHtml()} get content of current page
* {@link #putField(String, Object)} save extracted result
* {@link #getResultItems()} get extract results to be used in {@link us.codecraft.webmagic.pipeline.Pipeline}
- * {@link #addTargetRequests(java.util.List)} {@link #addTargetRequest(String)} add urls to fetch
+ * {@link #addTargetRequests(Iterable)} {@link #addTargetRequest(String)} add urls to fetch
* * @author code4crafter@gmail.com
* @see us.codecraft.webmagic.downloader.Downloader @@ -52,7 +52,7 @@ public class Page { private List targetRequests = new ArrayList(); private String charset; - + public Page() { } @@ -108,7 +108,8 @@ public Json getJson() { * @deprecated since 0.4.0 * The html is parse just when first time of calling {@link #getHtml()}, so use {@link #setRawText(String)} instead. */ - public void setHtml(Html html) { + @Deprecated + public void setHtml(Html html) { this.html = html; } @@ -121,7 +122,7 @@ public List getTargetRequests() { * * @param requests requests */ - public void addTargetRequests(List requests) { + public void addTargetRequests(Iterable requests) { for (String s : requests) { if (StringUtils.isBlank(s) || s.equals("#") || s.startsWith("javascript:")) { continue; @@ -137,7 +138,7 @@ public void addTargetRequests(List requests) { * @param requests requests * @param priority priority */ - public void addTargetRequests(List requests, long priority) { + public void addTargetRequests(Iterable requests, long priority) { for (String s : requests) { if (StringUtils.isBlank(s) || s.equals("#") || s.startsWith("javascript:")) { continue; From a266df406ff4641d751c0607d203930fd0e7d7a5 Mon Sep 17 00:00:00 2001 From: Joe Zhou Date: Tue, 20 Dec 2022 23:41:31 +0800 Subject: [PATCH 05/13] Add Site.defaultCharset. closes #1101. 
--- .../main/java/us/codecraft/webmagic/Site.java | 26 +++++++++++++++++++ .../downloader/HttpClientDownloader.java | 9 ++++--- .../java/us/codecraft/webmagic/SiteTest.java | 17 ++++++++++++ 3 files changed, 48 insertions(+), 4 deletions(-) create mode 100644 webmagic-core/src/test/java/us/codecraft/webmagic/SiteTest.java diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java index 4879b2825..230337756 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java @@ -28,6 +28,8 @@ public class Site { private String charset; + private String defaultCharset; + private int sleepTime = 5000; private int retryTimes = 0; @@ -168,6 +170,30 @@ public String getCharset() { return charset; } + /** + * Set default charset of page. + * + * When charset detect failed, use this default charset. + * + * @param defaultCharset the default charset + * @return this + * @since 0.9.0 + */ + public Site setDefaultCharset(String defaultCharset) { + this.defaultCharset = defaultCharset; + return this; + } + + /** + * The default charset if charset detected failed. 
+ * + * @return the default charset + * @since 0.9.0 + */ + public String getDefaultCharset() { + return defaultCharset; + } + public int getTimeOut() { return timeOut; } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java index 72821f3c1..bfd24f01c 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java @@ -4,6 +4,7 @@ import java.nio.charset.Charset; import java.util.HashMap; import java.util.Map; +import java.util.Optional; import org.apache.commons.io.IOUtils; import org.apache.http.HttpResponse; @@ -116,7 +117,7 @@ protected Page handleResponse(Request request, String charset, HttpResponse http page.setBytes(bytes); if (!request.isBinaryContent()) { if (charset == null) { - charset = getHtmlCharset(contentType, bytes); + charset = getHtmlCharset(contentType, bytes, task); } page.setCharset(charset); page.setRawText(new String(bytes, charset)); @@ -131,11 +132,11 @@ protected Page handleResponse(Request request, String charset, HttpResponse http return page; } - private String getHtmlCharset(String contentType, byte[] contentBytes) throws IOException { + private String getHtmlCharset(String contentType, byte[] contentBytes, Task task) throws IOException { String charset = CharsetUtils.detectCharset(contentType, contentBytes); if (charset == null) { - charset = Charset.defaultCharset().name(); - logger.warn("Charset autodetect failed, use {} as charset. 
Please specify charset in Site.setCharset()", Charset.defaultCharset()); + charset = Optional.ofNullable(task.getSite().getDefaultCharset()).orElseGet(Charset.defaultCharset()::name); + logger.info("Charset autodetect failed, use {} as charset.", task.getSite().getDefaultCharset()); } return charset; } diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/SiteTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/SiteTest.java new file mode 100644 index 000000000..783b82ddc --- /dev/null +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/SiteTest.java @@ -0,0 +1,17 @@ +package us.codecraft.webmagic; + +import static org.junit.Assert.assertEquals; + +import java.nio.charset.StandardCharsets; + +import org.junit.Test; + +public class SiteTest { + + @Test + public void test() { + Site site = Site.me().setDefaultCharset(StandardCharsets.UTF_8.name()); + assertEquals(StandardCharsets.UTF_8.name(), site.getDefaultCharset()); + } + +} From 12ce86425f4f5b09be06e49f0d19e84dfa10c54b Mon Sep 17 00:00:00 2001 From: hooyantsing Date: Fri, 3 Feb 2023 22:48:58 +0800 Subject: [PATCH 06/13] =?UTF-8?q?BugFix:=20Jsoup=20=E5=92=8C=20HtmlCleaner?= =?UTF-8?q?=20=E6=9E=84=E5=BB=BA=20Dom=20=E6=97=B6=EF=BC=8C=E8=8B=A5?= =?UTF-8?q?=E7=BC=BA=E5=A4=B1=20table=20=E6=A0=87=E7=AD=BE=EF=BC=8C?= =?UTF-8?q?=E5=88=99=E6=97=A0=E6=B3=95=E6=AD=A3=E5=B8=B8=E8=A7=A3=E6=9E=90?= =?UTF-8?q?=20tr=20=E5=92=8C=20td=20=E6=A0=87=E7=AD=BE=E3=80=82?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../selector/BaseElementSelector.java | 10 ++----- .../webmagic/utils/BaseSelectorUtils.java | 23 +++++++++++++++ .../webmagic/selector/Xpath2Selector.java | 28 +++++++++++-------- .../webmagic/selector/XpathSelectorTest.java | 19 +++++++++++++ 4 files changed, 60 insertions(+), 20 deletions(-) create mode 100644 webmagic-core/src/main/java/us/codecraft/webmagic/utils/BaseSelectorUtils.java diff --git 
a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/BaseElementSelector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/BaseElementSelector.java index b267d5ba9..6001767d8 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/BaseElementSelector.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/BaseElementSelector.java @@ -3,6 +3,7 @@ import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; +import us.codecraft.webmagic.utils.BaseSelectorUtils; import java.util.ArrayList; import java.util.List; @@ -13,16 +14,9 @@ */ public abstract class BaseElementSelector implements Selector, ElementSelector { private Document parse(String text) { - if (text == null) { - return null; - } - // Jsoup could not parse or tag directly // https://stackoverflow.com/questions/63607740/jsoup-couldnt-parse-tr-tag - if ((text.startsWith("") && text.endsWith("")) - || (text.startsWith("") && text.endsWith(""))) { - text = "" + text + "
"; - } + text = BaseSelectorUtils.preParse(text); return Jsoup.parse(text); } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/BaseSelectorUtils.java b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/BaseSelectorUtils.java new file mode 100644 index 000000000..04c0651c3 --- /dev/null +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/BaseSelectorUtils.java @@ -0,0 +1,23 @@ +package us.codecraft.webmagic.utils; + +/** + * @author hooy + */ +public class BaseSelectorUtils { + + /** + * Jsoup/HtmlCleaner could not parse "tr" or "td" tag directly + * https://stackoverflow.com/questions/63607740/jsoup-couldnt-parse-tr-tag + * + * @param text - the html string + * @return text + */ + public static String preParse(String text) { + if (((text.startsWith("") || text.startsWith("")) + || ((text.startsWith("") || text.startsWith(""))) { + text = "" + text + "
"; + } + return text; + } + +} diff --git a/webmagic-saxon/src/main/java/us/codecraft/webmagic/selector/Xpath2Selector.java b/webmagic-saxon/src/main/java/us/codecraft/webmagic/selector/Xpath2Selector.java index 9d5eef9b0..b63213b62 100644 --- a/webmagic-saxon/src/main/java/us/codecraft/webmagic/selector/Xpath2Selector.java +++ b/webmagic-saxon/src/main/java/us/codecraft/webmagic/selector/Xpath2Selector.java @@ -8,6 +8,7 @@ import java.util.concurrent.ConcurrentHashMap; import javax.xml.namespace.NamespaceContext; +import javax.xml.parsers.ParserConfigurationException; import javax.xml.transform.OutputKeys; import javax.xml.transform.Transformer; import javax.xml.transform.TransformerFactory; @@ -29,13 +30,14 @@ import net.sf.saxon.lib.NamespaceConstant; import net.sf.saxon.xpath.XPathEvaluator; +import us.codecraft.webmagic.utils.BaseSelectorUtils; /** * 支持xpath2.0的选择器。包装了HtmlCleaner和Saxon HE。
* * @author code4crafter@gmail.com
- * Date: 13-4-21 - * Time: 上午9:39 + * Date: 13-4-21 + * Time: 上午9:39 */ public class Xpath2Selector implements Selector { @@ -111,14 +113,11 @@ private void init() throws XPathExpressionException { @Override public String select(String text) { try { - HtmlCleaner htmlCleaner = new HtmlCleaner(); - TagNode tagNode = htmlCleaner.clean(text); - Document document = new DomSerializer(new CleanerProperties()).createDOM(tagNode); Object result; try { - result = xPathExpression.evaluate(document, XPathConstants.NODESET); + result = xPathExpression.evaluate(parse(text), XPathConstants.NODESET); } catch (XPathExpressionException e) { - result = xPathExpression.evaluate(document, XPathConstants.STRING); + result = xPathExpression.evaluate(parse(text), XPathConstants.STRING); } if (result instanceof NodeList) { NodeList nodeList = (NodeList) result; @@ -147,14 +146,11 @@ public String select(String text) { public List selectList(String text) { List results = new ArrayList(); try { - HtmlCleaner htmlCleaner = new HtmlCleaner(); - TagNode tagNode = htmlCleaner.clean(text); - Document document = new DomSerializer(new CleanerProperties()).createDOM(tagNode); Object result; try { - result = xPathExpression.evaluate(document, XPathConstants.NODESET); + result = xPathExpression.evaluate(parse(text), XPathConstants.NODESET); } catch (XPathExpressionException e) { - result = xPathExpression.evaluate(document, XPathConstants.STRING); + result = xPathExpression.evaluate(parse(text), XPathConstants.STRING); } if (result instanceof NodeList) { NodeList nodeList = (NodeList) result; @@ -179,4 +175,12 @@ public List selectList(String text) { } return results; } + + private Document parse(String text) throws ParserConfigurationException { + // HtmlCleaner could not parse or tag directly + text = BaseSelectorUtils.preParse(text); + HtmlCleaner htmlCleaner = new HtmlCleaner(); + TagNode tagNode = htmlCleaner.clean(text); + return new DomSerializer(new CleanerProperties()).createDOM(tagNode); + 
} } diff --git a/webmagic-saxon/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java b/webmagic-saxon/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java index 166188361..c2025e7c6 100644 --- a/webmagic-saxon/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java +++ b/webmagic-saxon/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java @@ -11,6 +11,9 @@ import org.junit.Ignore; import org.junit.Test; +import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.Spider; +import us.codecraft.webmagic.processor.PageProcessor; import us.codecraft.xsoup.XPathEvaluator; import us.codecraft.xsoup.Xsoup; @@ -1385,6 +1388,22 @@ public void testXpath2Selector() { Assert.assertEquals("http://www.oschina.net/", selectList.get(0)); } + @Ignore("test parse or tag directly text = BaseSelectorUtils.preParse(text); HtmlCleaner htmlCleaner = new HtmlCleaner(); TagNode tagNode = htmlCleaner.clean(text); return new DomSerializer(new CleanerProperties()).createDOM(tagNode); } + } diff --git a/webmagic-saxon/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java b/webmagic-saxon/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java index 8ac721934..4033fcfbd 100644 --- a/webmagic-saxon/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java +++ b/webmagic-saxon/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java @@ -11,12 +11,15 @@ import org.junit.Ignore; import org.junit.Test; +import org.w3c.dom.Node; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Spider; import us.codecraft.webmagic.processor.PageProcessor; import us.codecraft.xsoup.XPathEvaluator; import us.codecraft.xsoup.Xsoup; +import javax.xml.transform.TransformerException; + /** * @author code4crafter@gmail.com
Date: 13-4-21 Time: 上午10:06 */ @@ -1388,23 +1391,6 @@ public void testXpath2Selector() { Assert.assertEquals("http://www.oschina.net/", selectList.get(0)); } - @Ignore("test parse
tag") + @Test + public void htmlCleanerParseTest() { + Spider.create(new RuoxiaPageProcessor()).addUrl("http://www.ruoxia.com/top/dianji/month").thread(1).run(); + } + class RuoxiaPageProcessor implements PageProcessor { + @Override + public void process(Page page) { + List nodes = page.getHtml().xpath("//div[@class=\"bd\"]//tbody/tr").nodes(); + for (Selectable node:nodes) { + String name = node.xpath("//td[3]/div/a[1]/text()").get(); + System.out.println(name); + } + } + } + @Ignore("take long time") @Test public void performanceTest() { From 08f4a4046b4cb13a81684533534a7d51640c3e04 Mon Sep 17 00:00:00 2001 From: hooyantsing Date: Fri, 3 Feb 2023 22:59:56 +0800 Subject: [PATCH 07/13] =?UTF-8?q?Update:=20=E6=8F=90=E4=BE=9B=E6=B5=8B?= =?UTF-8?q?=E8=AF=95=E7=94=A8=E4=BE=8B=E3=80=82?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../webmagic/selector/XpathSelectorTest.java | 49 ++++++++++--------- 1 file changed, 25 insertions(+), 24 deletions(-) diff --git a/webmagic-saxon/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java b/webmagic-saxon/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java index c2025e7c6..8ac721934 100644 --- a/webmagic-saxon/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java +++ b/webmagic-saxon/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java @@ -1393,12 +1393,13 @@ public void testXpath2Selector() { public void htmlCleanerParseTest() { Spider.create(new RuoxiaPageProcessor()).addUrl("http://www.ruoxia.com/top/dianji/month").thread(1).run(); } + class RuoxiaPageProcessor implements PageProcessor { @Override public void process(Page page) { - List nodes = page.getHtml().xpath("//div[@class=\"bd\"]//tbody/tr").nodes(); - for (Selectable node:nodes) { - String name = node.xpath("//td[3]/div/a[1]/text()").get(); + List items = new Xpath2Selector("//div[@class=\"bd\"]//tbody/tr").selectList(page.getRawText()); + for (String item : 
items) { + String name = new Xpath2Selector("//td[3]/div/a[1]/text()").select(item); System.out.println(name); } } @@ -1408,31 +1409,31 @@ public void process(Page page) { @Test public void performanceTest() { Xpath2Selector xpath2Selector = new Xpath2Selector("//a"); - long time =System.currentTimeMillis(); + long time = System.currentTimeMillis(); for (int i = 0; i < 1000; i++) { xpath2Selector.selectList(html); } - System.out.println(System.currentTimeMillis()-time); + System.out.println(System.currentTimeMillis() - time); XpathSelector xpathSelector = new XpathSelector("//a"); - time =System.currentTimeMillis(); + time = System.currentTimeMillis(); for (int i = 0; i < 1000; i++) { xpathSelector.selectList(html); } - System.out.println(System.currentTimeMillis()-time); + System.out.println(System.currentTimeMillis() - time); - time =System.currentTimeMillis(); + time = System.currentTimeMillis(); for (int i = 0; i < 1000; i++) { xpath2Selector.selectList(html); } System.out.println(System.currentTimeMillis() - time); CssSelector cssSelector = new CssSelector("a"); - time =System.currentTimeMillis(); + time = System.currentTimeMillis(); for (int i = 0; i < 1000; i++) { cssSelector.selectList(html); } - System.out.println("css "+(System.currentTimeMillis()-time)); + System.out.println("css " + (System.currentTimeMillis() - time)); } @Ignore("take long time") @@ -1444,54 +1445,54 @@ public void parserPerformanceTest() throws XPatherException { TagNode tagNode = htmlCleaner.clean(html); Document document = Jsoup.parse(html); - long time =System.currentTimeMillis(); + long time = System.currentTimeMillis(); for (int i = 0; i < 2000; i++) { htmlCleaner.clean(html); } - System.out.println(System.currentTimeMillis()-time); + System.out.println(System.currentTimeMillis() - time); - time =System.currentTimeMillis(); + time = System.currentTimeMillis(); for (int i = 0; i < 2000; i++) { tagNode.evaluateXPath("//a"); } - System.out.println(System.currentTimeMillis()-time); + 
System.out.println(System.currentTimeMillis() - time); System.out.println("============="); - time =System.currentTimeMillis(); + time = System.currentTimeMillis(); for (int i = 0; i < 2000; i++) { Jsoup.parse(html); } - System.out.println(System.currentTimeMillis()-time); + System.out.println(System.currentTimeMillis() - time); - time =System.currentTimeMillis(); + time = System.currentTimeMillis(); for (int i = 0; i < 2000; i++) { document.select("a"); } - System.out.println(System.currentTimeMillis()-time); + System.out.println(System.currentTimeMillis() - time); System.out.println("============="); - time =System.currentTimeMillis(); + time = System.currentTimeMillis(); for (int i = 0; i < 2000; i++) { htmlCleaner.clean(html); } - System.out.println(System.currentTimeMillis()-time); + System.out.println(System.currentTimeMillis() - time); - time =System.currentTimeMillis(); + time = System.currentTimeMillis(); for (int i = 0; i < 2000; i++) { tagNode.evaluateXPath("//a"); } - System.out.println(System.currentTimeMillis()-time); + System.out.println(System.currentTimeMillis() - time); System.out.println("============="); XPathEvaluator compile = Xsoup.compile("//a"); - time =System.currentTimeMillis(); + time = System.currentTimeMillis(); for (int i = 0; i < 2000; i++) { compile.evaluate(document); } - System.out.println(System.currentTimeMillis()-time); + System.out.println(System.currentTimeMillis() - time); } From 717931166a5ea6e0931f85cb3efc195982ca7b91 Mon Sep 17 00:00:00 2001 From: hooy <56918789+hooyantsing@users.noreply.github.com> Date: Sat, 11 Feb 2023 02:14:11 +0800 Subject: [PATCH 08/13] =?UTF-8?q?=E5=90=91=20webmagic-saxon=20=E7=BB=84?= =?UTF-8?q?=E4=BB=B6=E6=8F=90=E4=BE=9B=E8=8B=A5=E5=B9=B2=E6=96=B0=20API?= =?UTF-8?q?=EF=BC=8C=E6=9B=B4=E4=BC=98=E9=9B=85=E6=9B=B4=E7=81=B5=E6=B4=BB?= =?UTF-8?q?=E6=9B=B4=E5=BC=BA=E5=A4=A7=20(#1108)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Feature: * webmagic-saxon 
组件新增若干新 API; * Update: 更优雅的写代码。 * Update: JaxpSelectorUtils 工具类增加 final 关键字。 --- .../webmagic/selector/JaxpSelectorUtils.java | 61 +++++++ .../webmagic/selector/NodeSelector.java | 32 ++++ .../webmagic/selector/Xpath2Selector.java | 155 ++++++++++-------- .../webmagic/selector/XpathSelectorTest.java | 57 +++++-- 4 files changed, 216 insertions(+), 89 deletions(-) create mode 100644 webmagic-saxon/src/main/java/us/codecraft/webmagic/selector/JaxpSelectorUtils.java create mode 100644 webmagic-saxon/src/main/java/us/codecraft/webmagic/selector/NodeSelector.java diff --git a/webmagic-saxon/src/main/java/us/codecraft/webmagic/selector/JaxpSelectorUtils.java b/webmagic-saxon/src/main/java/us/codecraft/webmagic/selector/JaxpSelectorUtils.java new file mode 100644 index 000000000..b03f3a2ab --- /dev/null +++ b/webmagic-saxon/src/main/java/us/codecraft/webmagic/selector/JaxpSelectorUtils.java @@ -0,0 +1,61 @@ +package us.codecraft.webmagic.selector; + +import org.w3c.dom.Node; +import org.w3c.dom.NodeList; + +import javax.xml.transform.OutputKeys; +import javax.xml.transform.Transformer; +import javax.xml.transform.TransformerException; +import javax.xml.transform.TransformerFactory; +import javax.xml.transform.dom.DOMSource; +import javax.xml.transform.stream.StreamResult; +import java.io.StringWriter; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; + +/** + * @author hooy + */ +public final class JaxpSelectorUtils { + + private JaxpSelectorUtils() { + throw new RuntimeException("The util class cannot be instanced"); + } + + public static List NodeListToArrayList(NodeList nodes) { + List list = new ArrayList<>(nodes.getLength()); + for (int i = 0; i < nodes.getLength(); i++) { + list.add(nodes.item(i)); + } + return list; + } + + public static String nodeToString(Node node) throws TransformerException { + List before = Collections.singletonList(node); + List after = nodesToStrings(before); + if (after.size() > 0) { + return after.get(0); 
+ } else { + return null; + } + } + + public static List nodesToStrings(List nodes) throws TransformerException { + List results = new ArrayList<>(nodes.size()); + Transformer transformer = TransformerFactory.newInstance().newTransformer(); + StreamResult xmlOutput = new StreamResult(); + transformer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "yes"); + for (Node node : nodes) { + if (node.getNodeType() == Node.ATTRIBUTE_NODE || node.getNodeType() == Node.TEXT_NODE) { + results.add(node.getTextContent()); + } else { + xmlOutput.setWriter(new StringWriter()); + transformer.transform(new DOMSource(node), xmlOutput); + results.add(xmlOutput.getWriter().toString()); + } + } + return results; + } + +} diff --git a/webmagic-saxon/src/main/java/us/codecraft/webmagic/selector/NodeSelector.java b/webmagic-saxon/src/main/java/us/codecraft/webmagic/selector/NodeSelector.java new file mode 100644 index 000000000..3e6339dda --- /dev/null +++ b/webmagic-saxon/src/main/java/us/codecraft/webmagic/selector/NodeSelector.java @@ -0,0 +1,32 @@ +package us.codecraft.webmagic.selector; + +import org.w3c.dom.Node; + +import java.util.List; + +/** + * Selector(extractor) for html node.
+ * + * @author hooy
+ * @since 0.8.0 + */ +public interface NodeSelector { + + /** + * Extract single result in text.
+ * If there are more than one result, only the first will be chosen. + * + * @param node node + * @return result + */ + String select(Node node); + + /** + * Extract all results in text.
+ * + * @param node node + * @return results + */ + List selectList(Node node); + +} diff --git a/webmagic-saxon/src/main/java/us/codecraft/webmagic/selector/Xpath2Selector.java b/webmagic-saxon/src/main/java/us/codecraft/webmagic/selector/Xpath2Selector.java index b63213b62..6c5d7b332 100644 --- a/webmagic-saxon/src/main/java/us/codecraft/webmagic/selector/Xpath2Selector.java +++ b/webmagic-saxon/src/main/java/us/codecraft/webmagic/selector/Xpath2Selector.java @@ -1,19 +1,10 @@ package us.codecraft.webmagic.selector; -import java.io.StringWriter; -import java.util.ArrayList; -import java.util.Iterator; -import java.util.List; -import java.util.Map; +import java.util.*; import java.util.concurrent.ConcurrentHashMap; import javax.xml.namespace.NamespaceContext; import javax.xml.parsers.ParserConfigurationException; -import javax.xml.transform.OutputKeys; -import javax.xml.transform.Transformer; -import javax.xml.transform.TransformerFactory; -import javax.xml.transform.dom.DOMSource; -import javax.xml.transform.stream.StreamResult; import javax.xml.xpath.XPathConstants; import javax.xml.xpath.XPathExpression; import javax.xml.xpath.XPathExpressionException; @@ -32,20 +23,22 @@ import net.sf.saxon.xpath.XPathEvaluator; import us.codecraft.webmagic.utils.BaseSelectorUtils; +import static us.codecraft.webmagic.selector.JaxpSelectorUtils.*; + /** * 支持xpath2.0的选择器。包装了HtmlCleaner和Saxon HE。
* - * @author code4crafter@gmail.com
+ * @author code4crafter@gmail.com, hooy
* Date: 13-4-21 * Time: 上午9:39 */ -public class Xpath2Selector implements Selector { +public class Xpath2Selector implements Selector, NodeSelector { - private String xpathStr; + private final String xpathStr; private XPathExpression xPathExpression; - private Logger logger = LoggerFactory.getLogger(getClass()); + private final Logger logger = LoggerFactory.getLogger(getClass()); public Xpath2Selector(String xpathStr) { this.xpathStr = xpathStr; @@ -56,25 +49,25 @@ public Xpath2Selector(String xpathStr) { } } + public static Xpath2Selector newInstance(String xpathStr) { + return new Xpath2Selector(xpathStr); + } + enum XPath2NamespaceContext implements NamespaceContext { INSTANCE; - private final Map prefix2NamespaceMap = new ConcurrentHashMap(); + private final Map prefix2NamespaceMap = new ConcurrentHashMap<>(); - private final Map> namespace2PrefixMap = new ConcurrentHashMap>(); + private final Map> namespace2PrefixMap = new ConcurrentHashMap<>(); private void put(String prefix, String namespaceURI) { prefix2NamespaceMap.put(prefix, namespaceURI); - List prefixes = namespace2PrefixMap.get(namespaceURI); - if (prefixes == null) { - prefixes = new ArrayList(); - namespace2PrefixMap.put(namespaceURI, prefixes); - } + List prefixes = namespace2PrefixMap.computeIfAbsent(namespaceURI, k -> new ArrayList<>()); prefixes.add(prefix); } - private XPath2NamespaceContext() { + XPath2NamespaceContext() { put("fn", NamespaceConstant.FN); put("xslt", NamespaceConstant.XSLT); put("xhtml", NamespaceConstant.XHTML); @@ -113,29 +106,18 @@ private void init() throws XPathExpressionException { @Override public String select(String text) { try { - Object result; - try { - result = xPathExpression.evaluate(parse(text), XPathConstants.NODESET); - } catch (XPathExpressionException e) { - result = xPathExpression.evaluate(parse(text), XPathConstants.STRING); - } - if (result instanceof NodeList) { - NodeList nodeList = (NodeList) result; - if (nodeList.getLength() == 0) { - return null; 
- } - Node item = nodeList.item(0); - if (item.getNodeType() == Node.ATTRIBUTE_NODE || item.getNodeType() == Node.TEXT_NODE) { - return item.getTextContent(); - } else { - StreamResult xmlOutput = new StreamResult(new StringWriter()); - Transformer transformer = TransformerFactory.newInstance().newTransformer(); - transformer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "yes"); - transformer.transform(new DOMSource(item), xmlOutput); - return xmlOutput.getWriter().toString(); - } - } - return result.toString(); + Document doc = parse(text); + return select(doc); + } catch (Exception e) { + logger.error("select text error! " + xpathStr, e); + } + return null; + } + + @Override + public String select(Node node) { + try { + return (String) xPathExpression.evaluate(node, XPathConstants.STRING); } catch (Exception e) { logger.error("select text error! " + xpathStr, e); } @@ -144,43 +126,72 @@ public String select(String text) { @Override public List selectList(String text) { - List results = new ArrayList(); try { - Object result; - try { - result = xPathExpression.evaluate(parse(text), XPathConstants.NODESET); - } catch (XPathExpressionException e) { - result = xPathExpression.evaluate(parse(text), XPathConstants.STRING); - } - if (result instanceof NodeList) { - NodeList nodeList = (NodeList) result; - Transformer transformer = TransformerFactory.newInstance().newTransformer(); - StreamResult xmlOutput = new StreamResult(); - transformer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "yes"); - for (int i = 0; i < nodeList.getLength(); i++) { - Node item = nodeList.item(i); - if (item.getNodeType() == Node.ATTRIBUTE_NODE || item.getNodeType() == Node.TEXT_NODE) { - results.add(item.getTextContent()); - } else { - xmlOutput.setWriter(new StringWriter()); - transformer.transform(new DOMSource(item), xmlOutput); - results.add(xmlOutput.getWriter().toString()); - } - } - } else { - results.add(result.toString()); - } + Document doc = parse(text); + return 
selectList(doc); + } catch (Exception e) { + logger.error("select text error! " + xpathStr, e); + } + return null; + } + + @Override + public List selectList(Node node) { + try { + NodeList result = (NodeList) xPathExpression.evaluate(node, XPathConstants.NODESET); + List nodes = NodeListToArrayList(result); + return nodesToStrings(nodes); } catch (Exception e) { logger.error("select text error! " + xpathStr, e); } - return results; + return null; } - private Document parse(String text) throws ParserConfigurationException { + public Node selectNode(String text) { + try { + Document doc = parse(text); + return selectNode(doc); + } catch (Exception e) { + logger.error("select text error! " + xpathStr, e); + } + return null; + } + + public Node selectNode(Node node) { + try { + return (Node) xPathExpression.evaluate(node, XPathConstants.NODE); + } catch (Exception e) { + logger.error("select text error! " + xpathStr, e); + } + return null; + } + + public List selectNodes(String text) { + try { + Document doc = parse(text); + return selectNodes(doc); + } catch (Exception e) { + logger.error("select text error! " + xpathStr, e); + } + return null; + } + + public List selectNodes(Node node) { + try { + NodeList result = (NodeList) xPathExpression.evaluate(node, XPathConstants.NODESET); + return NodeListToArrayList(result); + } catch (Exception e) { + logger.error("select text error! " + xpathStr, e); + } + return null; + } + + protected static Document parse(String text) throws ParserConfigurationException { // HtmlCleaner could not parse
tag") - @Test - public void htmlCleanerParseTest() { - Spider.create(new RuoxiaPageProcessor()).addUrl("http://www.ruoxia.com/top/dianji/month").thread(1).run(); - } - - class RuoxiaPageProcessor implements PageProcessor { - @Override - public void process(Page page) { - List items = new Xpath2Selector("//div[@class=\"bd\"]//tbody/tr").selectList(page.getRawText()); - for (String item : items) { - String name = new Xpath2Selector("//td[3]/div/a[1]/text()").select(item); - System.out.println(name); - } - } - } - @Ignore("take long time") @Test public void performanceTest() { @@ -1496,4 +1482,41 @@ public void parserPerformanceTest() throws XPatherException { } + /** + * New api test + * + * @author hooy + * @since 8.0 + */ + private String rank = "

点击榜

排名分类书名/最新章节作者推荐更新时间
1.现实
0
11-24 22:32
2.架空
1047
03-04 14:44
3.现实
0
07-20 09:06
4.豪门
0
12-03 09:12
5.现实
0
02-01 21:12
6.玄奇
3455
02-28 12:31
7.玄奇
20614
03-31 12:37
8.复仇
55
06-03 11:43
9.穿越
0
10-27 18:50
10.宫斗
320
10-31 13:58
11.宫斗
6268
07-12 20:23
12.现实
0
01-18 23:00
13.婚恋
0
12-14 20:50
14.修真
0
02-03 23:40
15.豪门
0
11-06 23:38
16.穿越
191
12-02 23:37
17.穿越
412
10-13 22:39
18.豪门
635
07-01 13:15
19.架空
144
06-18 09:35
20.宅斗
1032
08-15 19:03
21.宫斗
0
09-30 20:32
22.豪门
0
06-05 11:31
23.重生
80
11-25 19:56
24.异世
68
01-12 10:06
25.豪门
0
05-29 18:46
26.婚恋
2778
11-04 17:48
27.玄奇
207
12-06 16:57
28.穿越
260
01-04 23:26
29.豪门
0
12-07 21:39
30.架空
1127
06-06 17:28
31.穿越
113
09-13 09:06
32.架空
597
02-14 18:47
33.玄奇
528
06-04 22:04
34.穿越
328
06-06 22:09
35.架空
539
05-24 14:42
36.架空
0
03-05 23:27
37.穿越
3215
08-21 16:38
38.宫斗
905
08-04 20:24
39.玄奇
1328
07-25 10:58
40.穿越
203
01-27 20:53
41.宫斗
407
08-31 09:03
42.宅斗
16
05-03 17:38
43.豪门
0
11-10 08:00
44.婚恋
0
07-12 21:37
45.架空
0
06-23 21:02
46.玄奇
1382
05-31 20:36
47.重生
334
07-16 19:19
48.婚恋
505
11-01 16:42
49.婚恋
0
10-19 18:32
50.豪门
540
09-19 19:18
51.婚恋
226
03-18 13:09
52.穿越
1026
03-08 16:28
53.重生
304
02-19 10:25
54.玄奇
2617
02-15 20:57
55.穿越
199
09-04 19:43
56.同人
768
07-19 20:00
57.宅斗
0
02-13 18:13
58.豪门
0
11-12 22:23
59.架空
0
07-28 23:42
60.婚恋
0
02-03 23:09
61.豪门
285
01-07 19:21
62.重生
654
10-12 18:16
63.异能
617
06-18 20:23
64.宫斗
27
06-02 21:05
65.种田
206
08-31 19:23
66.宅斗
2444
08-19 15:51
67.宅斗
818
08-07 23:38
68.现代
0
12-23 17:02
69.玄奇
0
07-23 12:00
70.婚恋
0
11-01 16:43
71.豪门
0
09-12 00:01
72.架空
0
04-27 22:42
73.豪门
0
04-19 13:55
74.异能
62
07-30 00:00
75.穿越
1307
07-20 16:41
76.玄奇
12820
07-15 23:46
77.架空
828
06-06 17:54
78.宅斗
985
05-20 23:53
79.玄奇
4960
04-12 15:58
80.玄奇
245
03-02 23:11
81.宅斗
34
12-21 10:11
82.宅斗
1411
07-21 00:00
83.现代
0
07-31 10:10
84.玄奇
0
06-18 13:53
85.架空
0
12-03 23:41
86.玄奇
0
11-28 22:13
87.豪门
0
11-07 22:48
88.婚恋
0
08-29 23:15
89.种田
1831
08-21 16:38
90.豪门
0
07-11 21:25
91.豪门
0
06-13 15:37
92.豪门
0
05-07 22:10
93.豪门
0
02-28 00:01
94.豪门
304
12-16 07:30
95.婚恋
669
11-07 18:16
96.仙侠
54
09-25 19:51
97.豪门
655
08-31 13:02
98.现实
374
06-29 09:55
99.穿越
373
06-19 18:07
100.婚恋
159
06-04 21:05
"; + + @Test + public void testStringAPI() { + // testAPI: selectList(String) -> selectList(Node) + List items = new Xpath2Selector("//div[@class=\"bd\"]//tbody/tr").selectList(rank); + Assert.assertSame(100, items.size()); + // testAPI: select(String) -> select(Node) + String name = new Xpath2Selector("//td[3]/div/a[1]/text()").select(items.get(10)); + Assert.assertEquals("深宫安容传", name); + } + + @Test + public void testNodeAPI() { + // testAPI: selectNodes(String) -> selectNodes(Node) + List items = new Xpath2Selector("//div[@class=\"bd\"]//tbody/tr").selectNodes(rank); + Assert.assertSame(100, items.size()); + // testAPI: selectNode(Node) + Node item = new Xpath2Selector("./td[3]/div/a[1]").selectNode(items.get(10)); + String name = new Xpath2Selector("./text()").select(item); + Assert.assertEquals("深宫安容传", name); + } + + @Test + public void testUtilAPI() throws TransformerException { + Node item = Xpath2Selector.newInstance("//div[@class=\"bd\"]//tbody/tr[11]/td[3]/div/a[1]/text()").selectNode(rank); + // testAPI: nodeToString(Node) -> nodesToStrings(List) + String name = JaxpSelectorUtils.nodeToString(item); + Assert.assertEquals("深宫安容传", name); + } + } From 244ade7b4c88d21bd676a5ea128a8ac2a8f53456 Mon Sep 17 00:00:00 2001 From: Tanky-Zhang <48041180+Tanky-Zhang@users.noreply.github.com> Date: Wed, 22 Mar 2023 22:25:51 +0800 Subject: [PATCH 09/13] feat:update host verify (#1112) --- .../downloader/HttpClientGenerator.java | 48 ++++++++----------- 1 file changed, 20 insertions(+), 28 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java index 80e0f1085..167a5e1c6 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java @@ -1,16 +1,5 @@ package us.codecraft.webmagic.downloader; -import 
java.io.IOException; -import java.security.KeyManagementException; -import java.security.NoSuchAlgorithmException; -import java.security.cert.CertificateException; -import java.security.cert.X509Certificate; -import java.util.Map; - -import javax.net.ssl.SSLContext; -import javax.net.ssl.TrustManager; -import javax.net.ssl.X509TrustManager; - import org.apache.commons.lang3.JavaVersion; import org.apache.commons.lang3.SystemUtils; import org.apache.http.HttpException; @@ -22,28 +11,32 @@ import org.apache.http.config.SocketConfig; import org.apache.http.conn.socket.ConnectionSocketFactory; import org.apache.http.conn.socket.PlainConnectionSocketFactory; -import org.apache.http.conn.ssl.DefaultHostnameVerifier; import org.apache.http.conn.ssl.SSLConnectionSocketFactory; -import org.apache.http.impl.client.BasicCookieStore; -import org.apache.http.impl.client.CloseableHttpClient; -import org.apache.http.impl.client.DefaultHttpRequestRetryHandler; -import org.apache.http.impl.client.HttpClientBuilder; -import org.apache.http.impl.client.HttpClients; +import org.apache.http.impl.client.*; import org.apache.http.impl.conn.PoolingHttpClientConnectionManager; import org.apache.http.impl.cookie.BasicClientCookie; import org.apache.http.protocol.HttpContext; import org.slf4j.Logger; import org.slf4j.LoggerFactory; - import us.codecraft.webmagic.Site; +import javax.net.ssl.SSLContext; +import javax.net.ssl.TrustManager; +import javax.net.ssl.X509TrustManager; +import java.io.IOException; +import java.security.KeyManagementException; +import java.security.NoSuchAlgorithmException; +import java.security.cert.CertificateException; +import java.security.cert.X509Certificate; +import java.util.Map; + /** * @author code4crafter@gmail.com
* @since 0.4.0 */ public class HttpClientGenerator { - private transient Logger logger = LoggerFactory.getLogger(getClass()); + private transient Logger logger = LoggerFactory.getLogger(getClass()); private PoolingHttpClientConnectionManager connectionManager; @@ -61,21 +54,20 @@ private SSLConnectionSocketFactory buildSSLConnectionSocketFactory() { SSLContext sslContext = createIgnoreVerifySSL(); String[] supportedProtocols; if (SystemUtils.isJavaVersionAtLeast(JavaVersion.JAVA_11)) { - supportedProtocols = new String[] { "SSLv3", "TLSv1", "TLSv1.1", "TLSv1.2", "TLSv1.3" }; + supportedProtocols = new String[]{"SSLv3", "TLSv1", "TLSv1.1", "TLSv1.2", "TLSv1.3"}; } else { - supportedProtocols = new String[] { "SSLv3", "TLSv1", "TLSv1.1", "TLSv1.2" }; + supportedProtocols = new String[]{"SSLv3", "TLSv1", "TLSv1.1", "TLSv1.2"}; } logger.debug("supportedProtocols: {}", String.join(", ", supportedProtocols)); return new SSLConnectionSocketFactory(sslContext, supportedProtocols, null, - new DefaultHostnameVerifier()); // 优先绕过安全证书 - } catch (KeyManagementException e) { - logger.error("ssl connection fail", e); - } catch (NoSuchAlgorithmException e) { + //不进行主机校验 + (host, sslSession) -> true); // 优先绕过安全证书 + } catch (KeyManagementException | NoSuchAlgorithmException e) { logger.error("ssl connection fail", e); } return SSLConnectionSocketFactory.getSocketFactory(); - } + } private SSLContext createIgnoreVerifySSL() throws NoSuchAlgorithmException, KeyManagementException { // 实现一个X509TrustManager接口,用于绕过验证,不用修改里面的方法 @@ -97,9 +89,9 @@ public X509Certificate[] getAcceptedIssuers() { }; SSLContext sc = SSLContext.getInstance("TLS"); - sc.init(null, new TrustManager[] { trustManager }, null); + sc.init(null, new TrustManager[]{trustManager}, null); return sc; - } + } public HttpClientGenerator setPoolSize(int poolSize) { connectionManager.setMaxTotal(poolSize); From 58fd08bcf83909fe713f9a5db24d30b8c30a5824 Mon Sep 17 00:00:00 2001 From: Joe Zhou Date: Thu, 22 Jun 2023 00:27:30 
+0800 Subject: [PATCH 10/13] Expose Request to ProxyProvider. --- .../downloader/HttpClientDownloader.java | 2 +- .../webmagic/proxy/ProxyProvider.java | 19 ++++++++++++++++++- .../webmagic/proxy/SimpleProxyProvider.java | 3 ++- .../proxy/SimpleProxyProviderTest.java | 2 +- 4 files changed, 22 insertions(+), 4 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java index bfd24f01c..2f3ef58ed 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java @@ -77,7 +77,7 @@ public Page download(Request request, Task task) { } CloseableHttpResponse httpResponse = null; CloseableHttpClient httpClient = getHttpClient(task.getSite()); - Proxy proxy = proxyProvider != null ? proxyProvider.getProxy(task) : null; + Proxy proxy = proxyProvider != null ? proxyProvider.getProxy(request, task) : null; HttpClientRequestContext requestContext = httpUriRequestConverter.convert(request, task.getSite(), proxy); Page page = Page.fail(); try { diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyProvider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyProvider.java index 0cef4ed42..8eab4d6de 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyProvider.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyProvider.java @@ -1,6 +1,7 @@ package us.codecraft.webmagic.proxy; import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Task; /** @@ -23,7 +24,23 @@ public interface ProxyProvider { * Get a proxy for task by some strategy. * @param task the download task * @return proxy + * @deprecated Use {@link #getProxy(Request, Task)} instead. 
*/ - Proxy getProxy(Task task); + @Deprecated + default Proxy getProxy(Task task) { + throw new UnsupportedOperationException(); + } + + /** + * Returns a proxy for the request. + * + * @param request the request + * @param task the download task + * @return proxy + * @since 0.9.0 + */ + default Proxy getProxy(Request request, Task task) { + return this.getProxy(task); + } } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/SimpleProxyProvider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/SimpleProxyProvider.java index ddef6a88c..f4c3f73bb 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/SimpleProxyProvider.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/SimpleProxyProvider.java @@ -1,6 +1,7 @@ package us.codecraft.webmagic.proxy; import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Task; import java.util.ArrayList; @@ -44,7 +45,7 @@ public void returnProxy(Proxy proxy, Page page, Task task) { } @Override - public Proxy getProxy(Task task) { + public Proxy getProxy(Request request, Task task) { return proxies.get(incrForLoop()); } diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/proxy/SimpleProxyProviderTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/proxy/SimpleProxyProviderTest.java index 6495b16bf..8fda56ea9 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/proxy/SimpleProxyProviderTest.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/proxy/SimpleProxyProviderTest.java @@ -20,7 +20,7 @@ public void test_get_proxy() throws Exception { Proxy originProxy1 = new Proxy("127.0.0.1", 1087); Proxy originProxy2 = new Proxy("127.0.0.1", 1088); SimpleProxyProvider proxyProvider = SimpleProxyProvider.from(originProxy1, originProxy2); - Proxy proxy = proxyProvider.getProxy(TASK); + Proxy proxy = proxyProvider.getProxy(null, TASK); assertThat(proxy).isEqualTo(originProxy1); proxy = 
proxyProvider.getProxy(TASK); assertThat(proxy).isEqualTo(originProxy2); From a5fb4e041476ecd93346b3bc41354d5b29c6ae13 Mon Sep 17 00:00:00 2001 From: Joe Zhou Date: Thu, 22 Jun 2023 10:40:37 +0800 Subject: [PATCH 11/13] Upgrade dependencies. --- pom.xml | 2 +- webmagic-extension/pom.xml | 5 +++++ webmagic-samples/pom.xml | 8 ++++---- 3 files changed, 10 insertions(+), 5 deletions(-) diff --git a/pom.xml b/pom.xml index cdf618211..c81f4c557 100644 --- a/pom.xml +++ b/pom.xml @@ -124,7 +124,7 @@ us.codecraft xsoup - 0.3.6 + 0.3.7 com.alibaba diff --git a/webmagic-extension/pom.xml b/webmagic-extension/pom.xml index daf0c7fdc..8c7fdb3d2 100644 --- a/webmagic-extension/pom.xml +++ b/webmagic-extension/pom.xml @@ -14,6 +14,11 @@ redis.clients jedis + + org.assertj + assertj-core + test + com.google.guava guava diff --git a/webmagic-samples/pom.xml b/webmagic-samples/pom.xml index e015567c2..9e1623018 100644 --- a/webmagic-samples/pom.xml +++ b/webmagic-samples/pom.xml @@ -27,22 +27,22 @@ org.mapdb mapdb - 3.0.8 + 3.0.9 com.fasterxml.jackson.core jackson-core - 2.13.0-rc1 + 2.15.2 com.fasterxml.jackson.core jackson-annotations - 2.13.0-rc1 + 2.15.2 com.fasterxml.jackson.core jackson-databind - 2.13.4.2 + 2.15.2 From ad010927f6acb5c605e4befe269076629d8d4357 Mon Sep 17 00:00:00 2001 From: Joe Zhou Date: Thu, 22 Jun 2023 10:40:46 +0800 Subject: [PATCH 12/13] Fix test. 
--- .../webmagic/proxy/SimpleProxyProviderTest.java | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/proxy/SimpleProxyProviderTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/proxy/SimpleProxyProviderTest.java index 8fda56ea9..e9325a7a7 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/proxy/SimpleProxyProviderTest.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/proxy/SimpleProxyProviderTest.java @@ -1,6 +1,9 @@ package us.codecraft.webmagic.proxy; import org.junit.Test; +import org.mockito.Mockito; + +import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Task; @@ -20,11 +23,12 @@ public void test_get_proxy() throws Exception { Proxy originProxy1 = new Proxy("127.0.0.1", 1087); Proxy originProxy2 = new Proxy("127.0.0.1", 1088); SimpleProxyProvider proxyProvider = SimpleProxyProvider.from(originProxy1, originProxy2); - Proxy proxy = proxyProvider.getProxy(null, TASK); + Request request = Mockito.mock(Request.class); + Proxy proxy = proxyProvider.getProxy(request, TASK); assertThat(proxy).isEqualTo(originProxy1); - proxy = proxyProvider.getProxy(TASK); + proxy = proxyProvider.getProxy(request, TASK); assertThat(proxy).isEqualTo(originProxy2); - proxy = proxyProvider.getProxy(TASK); + proxy = proxyProvider.getProxy(request, TASK); assertThat(proxy).isEqualTo(originProxy1); } } From 3688226e327266cb3cfd1a1e3777ad94ad68d6f5 Mon Sep 17 00:00:00 2001 From: Joe Zhou Date: Thu, 22 Jun 2023 11:16:41 +0800 Subject: [PATCH 13/13] Update versions for release --- pom.xml | 2 +- webmagic-core/pom.xml | 2 +- webmagic-coverage/pom.xml | 2 +- webmagic-extension/pom.xml | 2 +- webmagic-samples/pom.xml | 2 +- webmagic-saxon/pom.xml | 2 +- webmagic-scripts/pom.xml | 2 +- webmagic-selenium/pom.xml | 2 +- 8 files changed, 8 insertions(+), 8 deletions(-) diff --git a/pom.xml b/pom.xml index c81f4c557..35c0c9bd0 100644 --- 
a/pom.xml +++ b/pom.xml @@ -1,7 +1,7 @@ us.codecraft - 0.8.1-SNAPSHOT + 0.9.0 4.0.0 pom diff --git a/webmagic-core/pom.xml b/webmagic-core/pom.xml index 7fe2ba6ff..b4feb1671 100644 --- a/webmagic-core/pom.xml +++ b/webmagic-core/pom.xml @@ -3,7 +3,7 @@ us.codecraft webmagic-parent - 0.8.1-SNAPSHOT + 0.9.0 4.0.0 diff --git a/webmagic-coverage/pom.xml b/webmagic-coverage/pom.xml index c6b70bce1..a0a5ffb48 100644 --- a/webmagic-coverage/pom.xml +++ b/webmagic-coverage/pom.xml @@ -8,7 +8,7 @@ us.codecraft webmagic-parent - 0.8.1-SNAPSHOT + 0.9.0 webmagic-coverage diff --git a/webmagic-extension/pom.xml b/webmagic-extension/pom.xml index 8c7fdb3d2..7cf0aa617 100644 --- a/webmagic-extension/pom.xml +++ b/webmagic-extension/pom.xml @@ -3,7 +3,7 @@ us.codecraft webmagic-parent - 0.8.1-SNAPSHOT + 0.9.0 4.0.0 diff --git a/webmagic-samples/pom.xml b/webmagic-samples/pom.xml index 9e1623018..e42e1fcd8 100644 --- a/webmagic-samples/pom.xml +++ b/webmagic-samples/pom.xml @@ -3,7 +3,7 @@ webmagic-parent us.codecraft - 0.8.1-SNAPSHOT + 0.9.0 4.0.0 diff --git a/webmagic-saxon/pom.xml b/webmagic-saxon/pom.xml index 732c23bd0..c5238760b 100644 --- a/webmagic-saxon/pom.xml +++ b/webmagic-saxon/pom.xml @@ -3,7 +3,7 @@ webmagic-parent us.codecraft - 0.8.1-SNAPSHOT + 0.9.0 4.0.0 diff --git a/webmagic-scripts/pom.xml b/webmagic-scripts/pom.xml index d1225dda2..0019ea3c8 100644 --- a/webmagic-scripts/pom.xml +++ b/webmagic-scripts/pom.xml @@ -3,7 +3,7 @@ webmagic-parent us.codecraft - 0.8.1-SNAPSHOT + 0.9.0 4.0.0 diff --git a/webmagic-selenium/pom.xml b/webmagic-selenium/pom.xml index a430772b6..63682001f 100644 --- a/webmagic-selenium/pom.xml +++ b/webmagic-selenium/pom.xml @@ -3,7 +3,7 @@ webmagic-parent us.codecraft - 0.8.1-SNAPSHOT + 0.9.0 4.0.0