diff --git a/pom.xml b/pom.xml
index 3e11396e0..d0abd3568 100644
--- a/pom.xml
+++ b/pom.xml
@@ -12,7 +12,7 @@
2.2.1
us.codecraft
- 1.0.0
+ 1.0.1
pom
UTF-8
diff --git a/webmagic-core/pom.xml b/webmagic-core/pom.xml
index 6e1d3c896..52cd7ba2c 100644
--- a/webmagic-core/pom.xml
+++ b/webmagic-core/pom.xml
@@ -8,7 +8,7 @@
us.codecraft
webmagic
- 1.0.0
+ 1.0.1
4.0.0
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java
index b4c161a9a..e8c75ccf1 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java
@@ -4,7 +4,6 @@
import us.codecraft.webmagic.selector.Html;
import us.codecraft.webmagic.selector.Json;
import us.codecraft.webmagic.selector.Selectable;
-import us.codecraft.webmagic.utils.HttpConstant;
import us.codecraft.webmagic.utils.UrlUtils;
import java.util.ArrayList;
@@ -43,9 +42,9 @@ public class Page {
private Map> headers;
- private int statusCode = HttpConstant.StatusCode.CODE_200;
+ private int statusCode;
- private boolean downloadSuccess = true;
+ private boolean downloadSuccess;
private byte[] bytes;
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java
index a35af70af..a71166421 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java
@@ -187,7 +187,7 @@ public Spider scheduler(Scheduler scheduler) {
*/
public Spider setScheduler(Scheduler updateScheduler) {
checkIfRunning();
- SpiderScheduler oldScheduler = this.scheduler;
+ Scheduler oldScheduler = scheduler.getScheduler();
scheduler.setScheduler(updateScheduler);
if (oldScheduler != null) {
Request request;
@@ -458,7 +458,6 @@ private void onDownloadSuccess(Request request, Page page) {
logger.info("page status code error, page {} , code: {}", request.getUrl(), page.getStatusCode());
}
sleep(site.getSleepTime());
- return;
}
private void onDownloaderFail(Request request) {
diff --git a/webmagic-coverage/pom.xml b/webmagic-coverage/pom.xml
index 19cdc33d7..98db3f826 100644
--- a/webmagic-coverage/pom.xml
+++ b/webmagic-coverage/pom.xml
@@ -10,7 +10,7 @@
us.codecraft
webmagic
- 1.0.0
+ 1.0.1
webmagic-coverage
diff --git a/webmagic-extension/pom.xml b/webmagic-extension/pom.xml
index 15f94cf5e..1fe18e066 100644
--- a/webmagic-extension/pom.xml
+++ b/webmagic-extension/pom.xml
@@ -8,7 +8,7 @@
us.codecraft
webmagic
- 1.0.0
+ 1.0.1
4.0.0
diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/FileCacheQueueScheduler.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/FileCacheQueueScheduler.java
index fec3c1db9..0dabdd954 100644
--- a/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/FileCacheQueueScheduler.java
+++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/FileCacheQueueScheduler.java
@@ -1,29 +1,13 @@
package us.codecraft.webmagic.scheduler;
-import java.io.BufferedReader;
-import java.io.Closeable;
-import java.io.File;
-import java.io.FileNotFoundException;
-import java.io.FileReader;
-import java.io.FileWriter;
-import java.io.IOException;
-import java.io.PrintWriter;
-import java.util.LinkedHashSet;
-import java.util.Set;
-import java.util.concurrent.BlockingQueue;
-import java.util.concurrent.Executors;
-import java.util.concurrent.LinkedBlockingQueue;
-import java.util.concurrent.ScheduledExecutorService;
-import java.util.concurrent.TimeUnit;
-import java.util.concurrent.atomic.AtomicBoolean;
-import java.util.concurrent.atomic.AtomicInteger;
-
-import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.math.NumberUtils;
-
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Task;
-import us.codecraft.webmagic.scheduler.component.DuplicateRemover;
+
+import java.io.*;
+import java.util.concurrent.*;
+import java.util.concurrent.atomic.AtomicBoolean;
+import java.util.concurrent.atomic.AtomicInteger;
/**
@@ -32,7 +16,7 @@
* @author code4crafter@gmail.com
* @since 0.2.0
*/
-public class FileCacheQueueScheduler extends DuplicateRemovedScheduler implements MonitorableScheduler,Closeable {
+public class FileCacheQueueScheduler extends DuplicateRemovedScheduler implements MonitorableScheduler, Closeable {
private String filePath = System.getProperty("java.io.tmpdir");
@@ -52,8 +36,6 @@ public class FileCacheQueueScheduler extends DuplicateRemovedScheduler implement
private BlockingQueue queue;
- private Set urls;
-
private ScheduledExecutorService flushThreadPool;
public FileCacheQueueScheduler(String filePath) {
@@ -83,36 +65,13 @@ private void init(Task task) {
}
private void initDuplicateRemover() {
- setDuplicateRemover(
- new DuplicateRemover() {
- @Override
- public boolean isDuplicate(Request request, Task task) {
- if (!inited.get()) {
- init(task);
- }
- return !urls.add(request.getUrl());
- }
-
- @Override
- public void resetDuplicateCheck(Task task) {
- urls.clear();
- }
-
- @Override
- public int getTotalRequestsCount(Task task) {
- return urls.size();
- }
- });
+ BloomFilterDuplicateRemover bloomFilterDuplicateRemover = new BloomFilterDuplicateRemover(this.filePath.hashCode()); // NOTE(review): the argument is a String hash, which may be negative or very large - confirm the constructor accepts an arbitrary int (it presumably expects a capacity)
+ setDuplicateRemover(bloomFilterDuplicateRemover);
}
private void initFlushThread() {
- flushThreadPool = Executors.newScheduledThreadPool(1);
- flushThreadPool.scheduleAtFixedRate(new Runnable() {
- @Override
- public void run() {
- flush();
- }
- }, 10, 10, TimeUnit.SECONDS);
+ flushThreadPool = Executors.newScheduledThreadPool(1); // one scheduler thread, used for the periodic flush below
+ flushThreadPool.scheduleAtFixedRate(this::flush, 10, 10, TimeUnit.SECONDS); // first flush after 10 s, then every 10 s
}
private void initWriter() {
@@ -127,7 +86,6 @@ private void initWriter() {
private void readFile() {
try {
queue = new LinkedBlockingQueue();
- urls = new LinkedHashSet();
readCursorFile();
readUrlFile();
// initDuplicateRemover();
@@ -140,46 +98,43 @@ private void readFile() {
}
private void readUrlFile() throws IOException {
- String line;
- BufferedReader fileUrlReader = null;
- try {
- fileUrlReader = new BufferedReader(new FileReader(getFileName(fileUrlAllName)));
+ try (BufferedReader fileUrlReader = new BufferedReader(new FileReader(getFileName(fileUrlAllName)))) {
+ String line;
int lineReaded = 0;
while ((line = fileUrlReader.readLine()) != null) {
- urls.add(line.trim());
+ Request request = deserializeRequest(line); // NOTE(review): the line is no longer trimmed before use - confirm deserializeRequest tolerates surrounding whitespace
+ this.getDuplicateRemover().isDuplicate(request, null); // result ignored; presumably called so the remover records the URL as seen - task is null, confirm the remover tolerates that
lineReaded++;
if (lineReaded > cursor.get()) {
- queue.add(deserializeRequest(line));
+ queue.add(request); // only lines past the saved cursor are queued again
}
}
- } finally {
- if (fileUrlReader != null) {
- IOUtils.closeQuietly(fileUrlReader);
- }
}
}
private void readCursorFile() throws IOException {
- BufferedReader fileCursorReader = null;
- try {
- fileCursorReader = new BufferedReader(new FileReader(getFileName(fileCursor)));
+ String fileName = getFileName(fileCursor);
+ try (BufferedReader fileCursorReader = new BufferedReader(new FileReader(fileName))) {
String line;
+ String lastLine = null; // last non-blank line seen so far
//read the last number
while ((line = fileCursorReader.readLine()) != null) {
- cursor = new AtomicInteger(NumberUtils.toInt(line));
+ line = line.trim();
+ if (!line.isEmpty()) {
+ lastLine = line;
+ }
}
- } finally {
- if (fileCursorReader != null) {
- IOUtils.closeQuietly(fileCursorReader);
+ if (lastLine != null) {
+ cursor.set(NumberUtils.toInt(lastLine)); // must be lastLine: line is always null once the loop has ended
}
}
}
-
+
public void close() throws IOException {
- flushThreadPool.shutdown();
- fileUrlWriter.close();
- fileCursorWriter.close();
- }
+ flushThreadPool.shutdown(); // request shutdown of the flush executor before the writers are closed
+ fileUrlWriter.close();
+ fileCursorWriter.close();
+ }
private String getFileName(String filename) {
return filePath + task.getUUID() + filename;
diff --git a/webmagic-extension/src/test/java/us/codecraft/webmagic/downloader/MockGithubDownloader.java b/webmagic-extension/src/test/java/us/codecraft/webmagic/downloader/MockGithubDownloader.java
index 91e3698cf..bb18aa2c5 100644
--- a/webmagic-extension/src/test/java/us/codecraft/webmagic/downloader/MockGithubDownloader.java
+++ b/webmagic-extension/src/test/java/us/codecraft/webmagic/downloader/MockGithubDownloader.java
@@ -938,6 +938,7 @@ public Page download(Request request, Task task) {
Page page = new Page();
page.setRawText(html);
page.setStatusCode(200);
+ page.setDownloadSuccess(true);
page.setRequest(new Request("https://github.com/code4craft/webmagic"));
page.setUrl(new PlainText("https://github.com/code4craft/webmagic"));
return page;
diff --git a/webmagic-samples/pom.xml b/webmagic-samples/pom.xml
index 921161362..76105d330 100644
--- a/webmagic-samples/pom.xml
+++ b/webmagic-samples/pom.xml
@@ -8,7 +8,7 @@
us.codecraft
webmagic
- 1.0.0
+ 1.0.1
4.0.0
diff --git a/webmagic-saxon/pom.xml b/webmagic-saxon/pom.xml
index 2530bd81d..c206d21a2 100644
--- a/webmagic-saxon/pom.xml
+++ b/webmagic-saxon/pom.xml
@@ -8,7 +8,7 @@
us.codecraft
webmagic
- 1.0.0
+ 1.0.1
4.0.0
diff --git a/webmagic-scripts/pom.xml b/webmagic-scripts/pom.xml
index 3c03aaf8e..123ac6699 100644
--- a/webmagic-scripts/pom.xml
+++ b/webmagic-scripts/pom.xml
@@ -8,7 +8,7 @@
us.codecraft
webmagic
- 1.0.0
+ 1.0.1
4.0.0
diff --git a/webmagic-selenium/pom.xml b/webmagic-selenium/pom.xml
index a0dc13861..d09deef50 100644
--- a/webmagic-selenium/pom.xml
+++ b/webmagic-selenium/pom.xml
@@ -8,7 +8,7 @@
us.codecraft
webmagic
- 1.0.0
+ 1.0.1
4.0.0