diff --git a/pom.xml b/pom.xml index 3e11396e0..d0abd3568 100644 --- a/pom.xml +++ b/pom.xml @@ -12,7 +12,7 @@ 2.2.1 us.codecraft - 1.0.0 + 1.0.1 pom UTF-8 diff --git a/webmagic-core/pom.xml b/webmagic-core/pom.xml index 6e1d3c896..52cd7ba2c 100644 --- a/webmagic-core/pom.xml +++ b/webmagic-core/pom.xml @@ -8,7 +8,7 @@ us.codecraft webmagic - 1.0.0 + 1.0.1 4.0.0 diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java index b4c161a9a..e8c75ccf1 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java @@ -4,7 +4,6 @@ import us.codecraft.webmagic.selector.Html; import us.codecraft.webmagic.selector.Json; import us.codecraft.webmagic.selector.Selectable; -import us.codecraft.webmagic.utils.HttpConstant; import us.codecraft.webmagic.utils.UrlUtils; import java.util.ArrayList; @@ -43,9 +42,9 @@ public class Page { private Map> headers; - private int statusCode = HttpConstant.StatusCode.CODE_200; + private int statusCode; - private boolean downloadSuccess = true; + private boolean downloadSuccess; private byte[] bytes; diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java index a35af70af..a71166421 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java @@ -187,7 +187,7 @@ public Spider scheduler(Scheduler scheduler) { */ public Spider setScheduler(Scheduler updateScheduler) { checkIfRunning(); - SpiderScheduler oldScheduler = this.scheduler; + Scheduler oldScheduler = scheduler.getScheduler(); scheduler.setScheduler(updateScheduler); if (oldScheduler != null) { Request request; @@ -458,7 +458,6 @@ private void onDownloadSuccess(Request request, Page page) { logger.info("page status code error, page {} , code: {}", request.getUrl(), page.getStatusCode()); } sleep(site.getSleepTime()); - return; } private void onDownloaderFail(Request request) { diff --git a/webmagic-coverage/pom.xml b/webmagic-coverage/pom.xml index 19cdc33d7..98db3f826 100644 --- a/webmagic-coverage/pom.xml +++ b/webmagic-coverage/pom.xml @@ -10,7 +10,7 @@ us.codecraft webmagic - 1.0.0 + 1.0.1 webmagic-coverage diff --git a/webmagic-extension/pom.xml b/webmagic-extension/pom.xml index 15f94cf5e..1fe18e066 100644 --- a/webmagic-extension/pom.xml +++ b/webmagic-extension/pom.xml @@ -8,7 +8,7 @@ us.codecraft webmagic - 1.0.0 + 1.0.1 4.0.0 diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/FileCacheQueueScheduler.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/FileCacheQueueScheduler.java index fec3c1db9..0dabdd954 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/FileCacheQueueScheduler.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/FileCacheQueueScheduler.java @@ -1,29 +1,13 @@ package us.codecraft.webmagic.scheduler; -import java.io.BufferedReader; -import java.io.Closeable; -import java.io.File; -import java.io.FileNotFoundException; -import java.io.FileReader; -import java.io.FileWriter; -import java.io.IOException; -import java.io.PrintWriter; -import java.util.LinkedHashSet; -import java.util.Set; -import java.util.concurrent.BlockingQueue; -import java.util.concurrent.Executors; -import java.util.concurrent.LinkedBlockingQueue; -import java.util.concurrent.ScheduledExecutorService; -import java.util.concurrent.TimeUnit; -import java.util.concurrent.atomic.AtomicBoolean; -import java.util.concurrent.atomic.AtomicInteger; - -import org.apache.commons.io.IOUtils; import org.apache.commons.lang3.math.NumberUtils; - import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Task; -import us.codecraft.webmagic.scheduler.component.DuplicateRemover; + +import java.io.*; +import java.util.concurrent.*; +import java.util.concurrent.atomic.AtomicBoolean; +import java.util.concurrent.atomic.AtomicInteger; /** @@ -32,7 +16,7 @@ * @author code4crafter@gmail.com
* @since 0.2.0 */ -public class FileCacheQueueScheduler extends DuplicateRemovedScheduler implements MonitorableScheduler,Closeable { +public class FileCacheQueueScheduler extends DuplicateRemovedScheduler implements MonitorableScheduler, Closeable { private String filePath = System.getProperty("java.io.tmpdir"); @@ -52,8 +36,6 @@ public class FileCacheQueueScheduler extends DuplicateRemovedScheduler implement private BlockingQueue queue; - private Set urls; - private ScheduledExecutorService flushThreadPool; public FileCacheQueueScheduler(String filePath) { @@ -83,36 +65,13 @@ private void init(Task task) { } private void initDuplicateRemover() { - setDuplicateRemover( - new DuplicateRemover() { - @Override - public boolean isDuplicate(Request request, Task task) { - if (!inited.get()) { - init(task); - } - return !urls.add(request.getUrl()); - } - - @Override - public void resetDuplicateCheck(Task task) { - urls.clear(); - } - - @Override - public int getTotalRequestsCount(Task task) { - return urls.size(); - } - }); + BloomFilterDuplicateRemover bloomFilterDuplicateRemover = new BloomFilterDuplicateRemover(this.filePath.hashCode()); + setDuplicateRemover(bloomFilterDuplicateRemover); } private void initFlushThread() { - flushThreadPool = Executors.newScheduledThreadPool(1); - flushThreadPool.scheduleAtFixedRate(new Runnable() { - @Override - public void run() { - flush(); - } - }, 10, 10, TimeUnit.SECONDS); + flushThreadPool = Executors.newScheduledThreadPool(1); + flushThreadPool.scheduleAtFixedRate(this::flush, 10, 10, TimeUnit.SECONDS); } private void initWriter() { @@ -127,7 +86,6 @@ private void initWriter() { private void readFile() { try { queue = new LinkedBlockingQueue(); - urls = new LinkedHashSet(); readCursorFile(); readUrlFile(); // initDuplicateRemover(); @@ -140,46 +98,43 @@ private void readFile() { } private void readUrlFile() throws IOException { - String line; - BufferedReader fileUrlReader = null; - try { - fileUrlReader = new BufferedReader(new FileReader(getFileName(fileUrlAllName))); + try (BufferedReader fileUrlReader = new BufferedReader(new FileReader(getFileName(fileUrlAllName)))) { + String line; int lineReaded = 0; while ((line = fileUrlReader.readLine()) != null) { - urls.add(line.trim()); + Request request = deserializeRequest(line); + this.getDuplicateRemover().isDuplicate(request, null); lineReaded++; if (lineReaded > cursor.get()) { - queue.add(deserializeRequest(line)); + queue.add(request); } } - } finally { - if (fileUrlReader != null) { - IOUtils.closeQuietly(fileUrlReader); - } } } private void readCursorFile() throws IOException { - BufferedReader fileCursorReader = null; - try { - fileCursorReader = new BufferedReader(new FileReader(getFileName(fileCursor))); + String fileName = getFileName(fileCursor); + try (BufferedReader fileCursorReader = new BufferedReader(new FileReader(fileName))) { String line; + String lastLine = null; //read the last number while ((line = fileCursorReader.readLine()) != null) { - cursor = new AtomicInteger(NumberUtils.toInt(line)); + line = line.trim(); + if (!line.isEmpty()) { + lastLine = line; + } } - } finally { - if (fileCursorReader != null) { - IOUtils.closeQuietly(fileCursorReader); + if (lastLine != null) { + cursor.set(NumberUtils.toInt(line)); } } } - + public void close() throws IOException { - flushThreadPool.shutdown(); - fileUrlWriter.close(); - fileCursorWriter.close(); - } + flushThreadPool.shutdown(); + fileUrlWriter.close(); + fileCursorWriter.close(); + } private String getFileName(String filename) { return filePath + task.getUUID() + filename; diff --git a/webmagic-extension/src/test/java/us/codecraft/webmagic/downloader/MockGithubDownloader.java b/webmagic-extension/src/test/java/us/codecraft/webmagic/downloader/MockGithubDownloader.java index 91e3698cf..bb18aa2c5 100644 --- a/webmagic-extension/src/test/java/us/codecraft/webmagic/downloader/MockGithubDownloader.java +++ b/webmagic-extension/src/test/java/us/codecraft/webmagic/downloader/MockGithubDownloader.java @@ -938,6 +938,7 @@ public Page download(Request request, Task task) { Page page = new Page(); page.setRawText(html); page.setStatusCode(200); + page.setDownloadSuccess(true); page.setRequest(new Request("https://github.com/code4craft/webmagic")); page.setUrl(new PlainText("https://github.com/code4craft/webmagic")); return page; diff --git a/webmagic-samples/pom.xml b/webmagic-samples/pom.xml index 921161362..76105d330 100644 --- a/webmagic-samples/pom.xml +++ b/webmagic-samples/pom.xml @@ -8,7 +8,7 @@ us.codecraft webmagic - 1.0.0 + 1.0.1 4.0.0 diff --git a/webmagic-saxon/pom.xml b/webmagic-saxon/pom.xml index 2530bd81d..c206d21a2 100644 --- a/webmagic-saxon/pom.xml +++ b/webmagic-saxon/pom.xml @@ -8,7 +8,7 @@ us.codecraft webmagic - 1.0.0 + 1.0.1 4.0.0 diff --git a/webmagic-scripts/pom.xml b/webmagic-scripts/pom.xml index 3c03aaf8e..123ac6699 100644 --- a/webmagic-scripts/pom.xml +++ b/webmagic-scripts/pom.xml @@ -8,7 +8,7 @@ us.codecraft webmagic - 1.0.0 + 1.0.1 4.0.0 diff --git a/webmagic-selenium/pom.xml b/webmagic-selenium/pom.xml index a0dc13861..d09deef50 100644 --- a/webmagic-selenium/pom.xml +++ b/webmagic-selenium/pom.xml @@ -8,7 +8,7 @@ us.codecraft webmagic - 1.0.0 + 1.0.1 4.0.0