Skip to content

Commit

Permalink
api change
Browse files Browse the repository at this point in the history
  • Loading branch information
flyingeek committed Mar 31, 2020
1 parent 4b293f1 commit 710b527
Show file tree
Hide file tree
Showing 6 changed files with 78 additions and 73 deletions.
43 changes: 11 additions & 32 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -24,33 +24,9 @@ npm run build

## Running from Scriptable

Copy `dist/scriptable-pdfjs.html` into Scriptable Document Folder.
Copy the file `dist/scriptable-pdfjs.html` into Scriptable Document Folder.

```javascript
const fm = FileManager.iCloud();
const wv = new WebView();
const htmlFileUrl = fm.joinPath(fm.documentsDirectory(), "scriptable-pdfjs.html");
await wv.loadFile(htmlFileUrl);

/*
In the WebView your javascript will have access to the pdfjs global var.
pdfjs.pdfjsLib is the pdfjs module
pdfjs.getPDFText is a convenience wrapper
You have to pass the pdf file as a base64 string
*/

let javascript = 'pdfjs.getPDFText(';
javascript += '"' + fm.read(pdfFilePath).toBase64String() + '"';
javascript += ');'

let result = "";
try {
result = await wv.evaluateJavaScript(javascript, true);
} catch (e) {
//...
}
//...
```
and use [scriptable-pdfjs-demo](https://gist.github.com/flyingeek/70f5e09887f17dbfcd11a4b620a68b28) to play.

## Running from Shortcuts app

Expand All @@ -71,27 +47,30 @@ try {
}
// use the same bookmark name as in the action above
const filePath = fm.bookmarkedPath("ShortcutPDF");
await fm. downloadFileFromiCloud(filePath); // works also for local file

// We execute pdfjs in a WebView
const wv = new WebView();
const htmlFileUrl = fm.joinPath(fm.documentsDirectory(), "scriptable-pdfjs.html");
await fm.downloadFileFromiCloud(htmlFileUrl);
await wv.loadFile(htmlFileUrl);

let javascript = 'pdfjs.getPDFText(';
let javascript = 'pdfjs.getText(';
javascript += '"' + fm.read(filePath).toBase64String() + '"';
javascript += ', (pageText) => pageText.includes("(Long copy #1)")';
javascript += ', true';
//javascript += ', (pageText) => pageText.includes("(Long copy #1)")';
//javascript += ', true';
javascript += ');'


let result = "";
try {
result = await wv.evaluateJavaScript(javascript, true);
} catch (e) {
result = "";
}
Script.setShortcutOutput(result);
return result;
```
For some reason (a bug in Shortcuts or Scriptable?), you cannot convert
the PDF to Base64 and pass it as an argument to the script. You have to use
the bookmark trick and do the Base64 conversion in Scriptable.

For files larger than 2.5 MB, you cannot run this script inline; you
have to modify the script to return the results via the clipboard.
56 changes: 39 additions & 17 deletions package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion package.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"name": "scriptable-pdfjs",
"version": "1.0.0",
"version": "1.0.0-beta.1",
"description": "converts a pdf to text in ios app scriptable app.",
"main": "index.js",
"scripts": {
Expand Down
10 changes: 0 additions & 10 deletions src/index.html
Original file line number Diff line number Diff line change
@@ -1,10 +0,0 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<title>scriptable-pdfjs</title>
</head>
<body>

</body>
</html>
38 changes: 26 additions & 12 deletions src/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -13,13 +13,14 @@ if (typeof window !== 'undefined' && 'Worker' in window) {
* returns the text of a page
* @param pdf:pdfjs.Document
* @param pageNo:number
* @param separator:string=""
* @returns {Promise<string>}
*/
async function getPageText(pdf, pageNo) {
async function getPageText(pdf, pageNo, separator="") {
// noinspection JSUnresolvedFunction
const page = await pdf.getPage(pageNo);
const tokenizedText = await page.getTextContent();
return tokenizedText.items.map(token => token.str).join("");
return tokenizedText.items.map(token => token.str).join(separator);
}

/**
Expand All @@ -42,15 +43,17 @@ async function getPageText(pdf, pageNo) {
* @param source: pdfjs.Document - the pdf document
* @param matchFn:[function] - the match function
* @param breakAfter:[boolean=false] - if true, stop the search after failure
* @param pageSeparator:string="" - the page separator
* @param tokenSeparator:string="" - the token separator
* @returns {Promise<string>}
*/
async function extractPDFText(source, matchFn, breakAfter=false){
export async function extractText(source, matchFn, breakAfter=false, pageSeparator="", tokenSeparator=""){
const pdfPages = [];
let matchingPagesCount = 0;
const pdf = await pdfjs.getDocument(source).promise;
const maxPages = pdf.numPages;
for (let pageNo = 1; pageNo <= maxPages; pageNo += 1) {
const pageText = await getPageText(pdf, pageNo);
const pageText = await getPageText(pdf, pageNo, tokenSeparator);
if (matchFn) {
if (matchFn(pageText, pageNo, pdf)) {
pdfPages.push(pageText);
Expand All @@ -62,23 +65,34 @@ async function extractPDFText(source, matchFn, breakAfter=false){
pdfPages.push(pageText);
}
}
return pdfPages.join("");
return pdfPages.join(pageSeparator);
}

/**
* a wrapper to the extractPDFText with completion and error handling
* @param base64string:string - the pdf in base64 string format
* a convenience wrapper to extractText using Scriptable completion fn
* @param source:[string|object] - pdf source accepted by pdfjs.getDocument
* @param matchFn:[function] - optional text matching function
* @param breakAfter:[boolean=false]
* @param pageSeparator:string="" - the page separator
* @param tokenSeparator:string="" - the token separator
*/
export function getPDFText(base64string, matchFn, breakAfter=false ) {
extractPDFText(
{data: atob(base64string)},
matchFn,
breakAfter).then((text) => {
export function getText(source, matchFn, breakAfter=false, pageSeparator="", tokenSeparator = "" ) {
extractText(source, matchFn, breakAfter, pageSeparator, tokenSeparator)
.then((text) => {
completion(text)
}, (error) => {
throw Error(error);
});
}
/**
* a wrapper for getText when you need to use base64 string as source
* @param base64string:string - the pdf in base64 string format
* @param matchFn:[function] - optional text matching function
* @param breakAfter:[boolean=false]
* @param pageSeparator:string="" - the page separator
* @param tokenSeparator:string="" - the token separator
*/
export function getTextFromBase64String(base64string, matchFn, breakAfter=false, pageSeparator="", tokenSeparator = "") {
getText({data: atob(base64string)}, matchFn, breakAfter, pageSeparator, tokenSeparator);
}

2 changes: 1 addition & 1 deletion webpack.config.js
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ const config = {
inlineSource: '.(js|css)$',
filename: `${infos.name}.html`,
}),
new HtmlWebpackInlineSourcePlugin(HtmlWebpackPlugin),
new HtmlWebpackInlineSourcePlugin(HtmlWebpackPlugin)
]
};

Expand Down

0 comments on commit 710b527

Please sign in to comment.