Skip to content

Commit

Permalink
Wip on jaeksoft#1896
Browse files Browse the repository at this point in the history
  • Loading branch information
emmanuel-keller committed Dec 18, 2017
1 parent 1b0cae1 commit d1fe4ab
Show file tree
Hide file tree
Showing 7 changed files with 889 additions and 53 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.txt
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ http://sourceforge.net/p/opensearchserve/bug-report/ (SF)
OpenSearchServer 1.6

Enhancement:
- GH-1900: Allow leading wildcard in Pattern query
- GH-1881: Support of array in database crawler
- GH-1879: Parsing of BLOB in the Database crawler
- GH-1870: Crawl cache should use compression
Expand Down
1 change: 1 addition & 0 deletions CHANGES.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
release date=12:00 14.09.2017,version=1.6.0,urgency=low,by=Emmanuel Keller <[email protected]>,distribution=unknown
* GH-1900: Allow leading wildcard in Pattern query
* GH-1881: Support of array in database crawler
* GH-1879: Parsing of BLOB in the Database crawler
* GH-1877: ClosedChannelException on Terms extraction
Expand Down
38 changes: 19 additions & 19 deletions pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@
<dependency>
<groupId>com.google.code.gson</groupId>
<artifactId>gson</artifactId>
<version>2.8.1</version>
<version>2.8.2</version>
</dependency>
<dependency>
<groupId>com.google.guava</groupId>
Expand All @@ -92,7 +92,7 @@
<dependency>
<artifactId>commons-codec</artifactId>
<groupId>commons-codec</groupId>
<version>1.10</version>
<version>1.11</version>
</dependency>
<dependency>
<groupId>commons-collections</groupId>
Expand All @@ -107,7 +107,7 @@
<dependency>
<groupId>commons-io</groupId>
<artifactId>commons-io</artifactId>
<version>2.5</version>
<version>2.6</version>
</dependency>
<dependency>
<groupId>commons-lang</groupId>
Expand Down Expand Up @@ -157,12 +157,12 @@
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-compress</artifactId>
<version>1.14</version>
<version>1.15</version>
</dependency>
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-lang3</artifactId>
<version>3.6</version>
<version>3.7</version>
</dependency>
<dependency>
<groupId>org.apache.httpcomponents</groupId>
Expand Down Expand Up @@ -406,7 +406,7 @@
<dependency>
<groupId>it.unimi.dsi</groupId>
<artifactId>fastutil</artifactId>
<version>7.2.0</version>
<version>8.1.1</version>
</dependency>
<dependency>
<groupId>com.google.code.gson</groupId>
Expand All @@ -415,7 +415,7 @@
<dependency>
<groupId>com.ibm.icu</groupId>
<artifactId>icu4j</artifactId>
<version>59.1</version>
<version>60.2</version>
</dependency>
<dependency>
<groupId>com.google.api.client</groupId>
Expand Down Expand Up @@ -443,7 +443,7 @@
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.10.3</version>
<version>1.11.2</version>
</dependency>
<dependency>
<groupId>org.json</groupId>
Expand Down Expand Up @@ -504,22 +504,22 @@
<dependency>
<groupId>org.apache.pdfbox</groupId>
<artifactId>pdfbox</artifactId>
<version>2.0.7</version>
<version>2.0.8</version>
</dependency>
<dependency>
<groupId>org.bouncycastle</groupId>
<artifactId>bcprov-jdk15on</artifactId>
<version>1.57</version>
<version>1.58</version>
</dependency>
<dependency>
<groupId>org.bouncycastle</groupId>
<artifactId>bcmail-jdk15on</artifactId>
<version>1.57</version>
<version>1.58</version>
</dependency>
<dependency>
<groupId>org.bouncycastle</groupId>
<artifactId>bcpkix-jdk15on</artifactId>
<version>1.57</version>
<version>1.58</version>
</dependency>
<!-- PDFBox END -->
<dependency>
Expand Down Expand Up @@ -777,17 +777,17 @@
<dependency>
<groupId>org.bouncycastle</groupId>
<artifactId>bcprov-jdk15</artifactId>
<version>1.46</version>
<version>1.47</version>
</dependency>
<dependency>
<groupId>org.bouncycastle</groupId>
<artifactId>bcmail-jdk15</artifactId>
<version>1.46</version>
<version>1.47</version>
</dependency>
<dependency>
<groupId>org.antlr</groupId>
<artifactId>antlr4-runtime</artifactId>
<version>4.7</version>
<version>4.7.1</version>
</dependency>
<dependency>
<groupId>net.sf.opencsv</groupId>
Expand Down Expand Up @@ -819,7 +819,7 @@
<artifactId>RoaringBitmap</artifactId>
</dependency>
<dependency>
<groupId>mysql</groupId>
<groupId>mysql</groupId>
<artifactId>mysql-connector-java</artifactId>
<version>5.1.43</version>
</dependency>
Expand All @@ -841,7 +841,7 @@
<dependency>
<groupId>org.apache.derby</groupId>
<artifactId>derby</artifactId>
<version>10.13.1.1</version>
<version>10.14.1.0</version>
</dependency>
<dependency>
<groupId>com.ingres.jdbc</groupId>
Expand All @@ -856,7 +856,7 @@
<dependency>
<groupId>org.xerial</groupId>
<artifactId>sqlite-jdbc</artifactId>
<version>3.19.3</version>
<version>3.21.0.1</version>
</dependency>
<dependency>
<groupId>com.qwazr</groupId>
Expand Down Expand Up @@ -899,7 +899,7 @@
<poi.version>3.14</poi.version>
<selenium.version>3.4.0</selenium.version>
<zk.version>6.5.4</zk.version>
<jackson.version>2.9.1</jackson.version>
<jackson.version>2.9.3</jackson.version>
<tomcat.version>7.0.81</tomcat.version>
<slf4j.version>1.7.25</slf4j.version>
<jersey.version>2.25.1</jersey.version>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,6 @@
import java.nio.charset.Charset;
import java.nio.charset.UnsupportedCharsetException;
import java.util.Collection;
import java.util.Set;

public class HtmlCleanerParser extends HtmlDocumentProvider<Node> {

Expand Down Expand Up @@ -98,7 +97,6 @@ public String findCharset() {
return charsetCache;
} catch (UnsupportedCharsetException e2) {
Logging.warn(e2);
charsetCache = null;
return null;
}
}
Expand Down
Original file line number Diff line number Diff line change
@@ -1,29 +1,35 @@
/**
/**
* License Agreement for OpenSearchServer
*
* <p>
* Copyright (C) 2008-2013 Emmanuel Keller / Jaeksoft
*
* <p>
* http://www.open-search-server.com
*
* <p>
* This file is part of OpenSearchServer.
*
* <p>
* OpenSearchServer is free software: you can redistribute it and/or
* modify it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* (at your option) any later version.
* <p>
* OpenSearchServer is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with OpenSearchServer.
* If not, see <http://www.gnu.org/licenses/>.
* <p>
* You should have received a copy of the GNU General Public License
* along with OpenSearchServer.
* If not, see <http://www.gnu.org/licenses/>.
**/

package com.jaeksoft.searchlib.streamlimiter;

import com.jaeksoft.searchlib.SearchLibException;
import com.jaeksoft.searchlib.util.IOUtils;
import com.jaeksoft.searchlib.util.StringUtils;
import org.apache.commons.codec.digest.DigestUtils;
import org.apache.commons.io.FileUtils;

import java.io.BufferedInputStream;
import java.io.Closeable;
import java.io.File;
Expand All @@ -33,13 +39,6 @@
import java.util.ArrayList;
import java.util.List;

import org.apache.commons.codec.digest.DigestUtils;
import org.apache.commons.io.FileUtils;

import com.jaeksoft.searchlib.SearchLibException;
import com.jaeksoft.searchlib.util.IOUtils;
import com.jaeksoft.searchlib.util.StringUtils;

public abstract class StreamLimiter implements Closeable {

private final List<InputStream> inputStreamList;
Expand All @@ -50,45 +49,40 @@ public abstract class StreamLimiter implements Closeable {
private final String originURL;
private String detectedCharset;

protected StreamLimiter(long limit, String originalFileName,
String originURL) throws IOException {
protected StreamLimiter(long limit, String originalFileName, String originURL) {
this.limit = limit;
this.outputCache = null;
this.inputStreamList = new ArrayList<InputStream>(0);
this.tempFiles = new ArrayList<File>(0);
this.inputStreamList = new ArrayList<>(0);
this.tempFiles = new ArrayList<>(0);
this.originalFileName = originalFileName;
this.originURL = originURL;
this.detectedCharset = null;
}

public abstract File getFile() throws IOException, SearchLibException;

final protected void loadOutputCache(InputStream inputStream)
throws LimitException, IOException {
final protected void loadOutputCache(InputStream inputStream) throws LimitException, IOException {
if (outputCache != null)
return;
outputCache = CachedMemoryStream.getCachedStream(inputStream, limit);
}

final protected void loadOutputCache(File file) throws LimitException,
IOException {
final protected void loadOutputCache(File file) throws LimitException, IOException {
if (outputCache != null)
return;
if (file.isDirectory())
return;
outputCache = new CachedFileStream(file, limit);
}

protected abstract void loadOutputCache() throws LimitException,
IOException;
protected abstract void loadOutputCache() throws LimitException, IOException;

public InputStream getNewInputStream() throws IOException {
if (outputCache == null)
loadOutputCache();
if (outputCache == null)
return null;
InputStream inputStream = registerInputStream(outputCache
.getNewInputStream());
InputStream inputStream = registerInputStream(outputCache.getNewInputStream());
if (inputStream.markSupported())
return inputStream;
inputStream = registerInputStream(new BufferedInputStream(inputStream));
Expand All @@ -100,8 +94,7 @@ private final InputStream registerInputStream(final InputStream inputStream) {
return inputStream;
}

public String getMD5Hash() throws NoSuchAlgorithmException, LimitException,
IOException {
public String getMD5Hash() throws NoSuchAlgorithmException, LimitException, IOException {
if (outputCache == null)
loadOutputCache();
if (outputCache == null)
Expand Down
86 changes: 86 additions & 0 deletions src/test/java/com/jaeksoft/searchlib/test/HtmlParserTest.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
/**
* License Agreement for OpenSearchServer
* <p>
* Copyright (C) 2016-2017 Emmanuel Keller / Jaeksoft
* <p>
* http://www.open-search-server.com
* <p>
* This file is part of OpenSearchServer.
* <p>
* OpenSearchServer is free software: you can redistribute it and/or
* modify it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
* <p>
* OpenSearchServer is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
* <p>
* You should have received a copy of the GNU General Public License
* along with OpenSearchServer.
* If not, see <http://www.gnu.org/licenses/>.
**/
package com.jaeksoft.searchlib.test;

import com.jaeksoft.searchlib.SearchLibException;
import com.jaeksoft.searchlib.analysis.ClassPropertyEnum;
import com.jaeksoft.searchlib.analysis.LanguageEnum;
import com.jaeksoft.searchlib.parser.HtmlParser;
import com.jaeksoft.searchlib.parser.ParserFieldEnum;
import com.jaeksoft.searchlib.parser.ParserFieldTarget;
import com.jaeksoft.searchlib.parser.ParserResultItem;
import com.jaeksoft.searchlib.parser.htmlParser.HtmlParserEnum;
import com.jaeksoft.searchlib.streamlimiter.StreamLimiter;
import com.jaeksoft.searchlib.streamlimiter.StreamLimiterFile;
import com.jaeksoft.searchlib.util.map.SourceField;
import org.junit.Assert;
import org.junit.Test;

import java.io.IOException;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.List;

/**
 * Checks that the {@code generatedSource} field produced by {@link HtmlParser} honours the
 * {@code XPATH_EXCLUSION} property, once for each HTML provider exercised below.
 */
public class HtmlParserTest {

    /**
     * Parses the {@code oss.html} test resource with the given HTML provider while excluding
     * every node matching {@code //h3}, then verifies that:
     * <ul>
     * <li>the provider reported by the parser is the one that was requested;</li>
     * <li>the generated source still contains an {@code <h4>} tag;</li>
     * <li>the generated source no longer contains any {@code <h3>} tag.</li>
     * </ul>
     *
     * @param htmlParserEnum the HTML provider to select through the {@code HTML_PARSER} property
     */
    private void testGeneratedSourceWithExclusion(final HtmlParserEnum htmlParserEnum) {
        final Path htmlFile =
                Paths.get("src", "test", "resources", "com", "jaeksoft", "searchlib", "test", "oss.html");
        // StreamLimiter is Closeable: try-with-resources releases it even when an assertion fails.
        try (final StreamLimiter streamLimiter =
                new StreamLimiterFile(0, htmlFile.toFile(), htmlFile.toUri().toString())) {
            final HtmlParser parser = new HtmlParser();
            parser.initProperties();
            parser.getFieldMap()
                    .add(new SourceField("generatedSource"),
                            new ParserFieldTarget("generatedSource", null, null, false));
            parser.setUserProperty(ClassPropertyEnum.XPATH_EXCLUSION.getName(), "//h3");
            parser.setUserProperty(ClassPropertyEnum.HTML_PARSER.getName(), htmlParserEnum.getLabel());
            parser.doParserContent(null, null, streamLimiter, LanguageEnum.ENGLISH);

            final List<ParserResultItem> results = parser.getParserResults();
            // Fail with a readable message rather than an IndexOutOfBoundsException on get(0).
            Assert.assertNotNull("The parser returned no result list", results);
            Assert.assertFalse("The parser returned an empty result list", results.isEmpty());
            final ParserResultItem firstResult = results.get(0);

            final String htmlProvider = firstResult.getFieldValue(ParserFieldEnum.htmlProvider, 0);
            Assert.assertEquals(htmlParserEnum.getLabel(), htmlProvider);

            final String generatedSource = firstResult.getFieldValue(ParserFieldEnum.generatedSource, 0);
            // Guard against a NullPointerException on contains() when the field was not populated.
            Assert.assertNotNull("The generatedSource field is missing", generatedSource);
            Assert.assertTrue(generatedSource.contains("<h4>"));
            Assert.assertFalse(generatedSource.contains("<h3>"));
        } catch (IOException | SearchLibException e) {
            // Keep the original exception as the cause so the stack trace shows up in the report.
            throw new AssertionError("Parsing failed with " + htmlParserEnum.getLabel() + ": " + e, e);
        }
    }

    /** Runs the exclusion check with the HtmlCleaner provider. */
    @Test
    public void testGeneratedSourceWithExclusionHtmlCleaner() {
        testGeneratedSourceWithExclusion(HtmlParserEnum.HtmlCleanerParser);
    }

    /** Runs the exclusion check with the TagSoup provider. */
    @Test
    public void testGeneratedSourceWithExclusionTagSoup() {
        testGeneratedSourceWithExclusion(HtmlParserEnum.TagSoupParser);
    }

    /** Runs the exclusion check with the NekoHtml provider. */
    @Test
    public void testGeneratedSourceWithExclusionNekoHtml() {
        testGeneratedSourceWithExclusion(HtmlParserEnum.NekoHtmlParser);
    }
}
Loading

0 comments on commit d1fe4ab

Please sign in to comment.