Skip to content
This repository has been archived by the owner on Feb 16, 2023. It is now read-only.

Commit

Permalink
added a very crude and largely untested API endpoint for document merging #335
Browse files Browse the repository at this point in the history
  • Loading branch information
jonaswinkler committed Mar 11, 2021
1 parent 321869e commit 1e131f0
Show file tree
Hide file tree
Showing 4 changed files with 155 additions and 62 deletions.
145 changes: 108 additions & 37 deletions src/documents/merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,10 @@
import tempfile

from django.conf import settings
from django_q.tasks import async_task

from documents import tasks
from documents.consumer import ConsumerError
from documents.models import Document

from pikepdf import Pdf
Expand All @@ -12,55 +15,123 @@ class MergeError(Exception):
pass


class PdfCache:
    """Cache of open pikepdf.Pdf objects, keyed by document primary key.

    A split/merge plan frequently references the same source document
    several times; caching avoids re-opening and re-parsing the PDF for
    every reference.  Call close_all() when finished to release the
    underlying file handles.
    """

    def __init__(self):
        # Maps Document.pk -> open pikepdf.Pdf instance.
        self.cache = dict()

    def open_from_document(self, document: Document):
        """Return an open Pdf for *document*, reusing a cached instance.

        Prefers the original file when it is already a PDF and falls back
        to the archived PDF version otherwise.

        Raises:
            MergeError: if the document has no PDF representation, or the
                chosen file is missing on disk.
        """
        if document.pk in self.cache:
            return self.cache[document.pk]

        if document.mime_type == 'application/pdf':
            filename = document.source_path
        elif document.has_archive_version:
            filename = document.archive_path
        else:
            # Neither the original nor an archive version is a PDF.
            raise MergeError(
                f"Document {document.pk} has no PDF representation")

        if not os.path.exists(filename):
            raise MergeError(f"File {filename} does not exist on disk")

        pdf = Pdf.open(filename)
        self.cache[document.pk] = pdf

        return pdf

    def close_all(self):
        """Close every cached Pdf and empty the cache."""
        for pk in self.cache:
            self.cache[pk].close()

        self.cache.clear()

def consume_many_files(kwargs_list, delete_document_ids=None):
    """Consume a batch of files sequentially; intended to run as an async task.

    Args:
        kwargs_list: list of keyword-argument dicts, each forwarded to
            tasks.consume_file().
        delete_document_ids: optional ids of the source documents to delete
            once every file has been consumed successfully.

    Raises:
        ConsumerError: re-raised after rolling back all documents created
            so far, making the batch all-or-nothing.
    """
    new_document_ids = []

    try:
        for kwargs in kwargs_list:
            new_document_ids.append(tasks.consume_file(**kwargs))
    except ConsumerError:
        # Roll back: delete all documents created before the failure.
        # filter().delete() tolerates ids that no longer exist, unlike
        # get(), which would raise DoesNotExist and mask the original error.
        Document.objects.filter(id__in=new_document_ids).delete()
        raise
    else:
        # Everything succeeded: optionally delete the source documents.
        if delete_document_ids:
            Document.objects.filter(id__in=delete_document_ids).delete()

def execute_split_merge_plan(plan, tempdir: str, metadata: str = "redo", delete_source: bool = False, preview: bool = True):
    """Execute a split/merge plan and optionally queue consumption of the results.

    Args:
        plan: list of target document specs.  Each spec is a list of
            {"document": id, "pages": [indices]} entries; "pages" is
            optional and zero-based, omitting it takes all pages.
        tempdir: directory into which the merged PDFs are written.
        metadata: "copy_first" copies correspondent, document type and tags
            from the first source document of each target; any other value
            carries over only the title.
        delete_source: delete the source documents after successful
            consumption.
        preview: when True, only produce the files without queueing
            consumption.

    Returns:
        List of basenames (within tempdir) of the generated PDF files.

    Raises:
        MergeError: for unknown documents, out-of-range page indices, or
            source documents without a PDF representation.
    """
    consume_tasks = []
    cache = PdfCache()
    source_documents = set()

    try:
        for target_document_spec in plan:
            # Build one new PDF from the documents in target_document_spec.
            target_pdf: Pdf = Pdf.new()
            version = target_pdf.pdf_version

            for source_document_spec in target_document_spec:
                source_document_id = source_document_spec['document']
                source_documents.add(source_document_id)

                # None means "all pages of this source document".
                pages = source_document_spec.get('pages')

                try:
                    source_document: Document = Document.objects.get(
                        id=source_document_id)
                except Document.DoesNotExist:
                    raise MergeError(
                        f"Document {source_document_id} does not exist")

                source_pdf: Pdf = cache.open_from_document(source_document)
                # The output PDF version must satisfy every input.
                version = max(version, source_pdf.pdf_version)

                if pages is not None:
                    for page in pages:
                        if page >= len(source_pdf.pages):
                            raise MergeError(
                                f"Page index {page} out of range for "
                                f"document {source_document_id}")
                        target_pdf.pages.append(source_pdf.pages[page])
                else:
                    target_pdf.pages.extend(source_pdf.pages)

            # mkstemp creates a persistent file; NamedTemporaryFile would
            # delete its file again as soon as the unreferenced handle is
            # garbage collected, racing with the save below.
            fd, target_pdf_filename = tempfile.mkstemp(
                suffix="_pdf", dir=tempdir)
            os.close(fd)

            target_pdf.remove_unreferenced_resources()
            target_pdf.save(target_pdf_filename, min_version=version)
            target_pdf.close()

            consume_task = {"path": target_pdf_filename}

            first_id = target_document_spec[0]["document"]
            first_doc: Document = Document.objects.get(id=first_id)

            consume_task["override_title"] = first_doc.title

            if metadata == "copy_first":
                if first_doc.correspondent:
                    consume_task["override_correspondent_id"] = \
                        first_doc.correspondent.id
                if first_doc.document_type:
                    # Was ".hidden", which is not an id field; the pk is
                    # what consume_file expects.
                    consume_task["override_document_type_id"] = \
                        first_doc.document_type.id
                if first_doc.tags.count() > 0:
                    # .all() is required: the related manager itself is
                    # not iterable.
                    consume_task["override_tag_ids"] = [
                        tag.id for tag in first_doc.tags.all()]

            consume_tasks.append(consume_task)
    finally:
        # Always release open PDF handles, even when the plan fails.
        cache.close_all()

    if not preview:
        async_task(
            "documents.merge.consume_many_files",
            kwargs_list=consume_tasks,
            delete_document_ids=list(source_documents) if delete_source else None
        )

    return [os.path.basename(t["path"]) for t in consume_tasks]
4 changes: 1 addition & 3 deletions src/documents/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,9 +82,7 @@ def consume_file(path,
)

if document:
return "Success. New document id {} created".format(
document.pk
)
return document.pk
else:
raise ConsumerError("Unknown error: Returned document was null, but "
"no error message was given.")
Expand Down
53 changes: 38 additions & 15 deletions src/documents/views.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from datetime import datetime
from time import mktime

import pathvalidate
from django.conf import settings
from django.db.models import Count, Max, Case, When, IntegerField
from django.db.models.functions import Lower
Expand All @@ -18,6 +19,7 @@
from rest_framework import parsers
from rest_framework.decorators import action
from rest_framework.filters import OrderingFilter, SearchFilter
from rest_framework.generics import GenericAPIView
from rest_framework.mixins import (
DestroyModelMixin,
ListModelMixin,
Expand Down Expand Up @@ -673,28 +675,49 @@ def post(self, request, format=None):
return response


class DocumentSplitMergeViewSet(GenericViewSet):
    """API endpoint for executing split/merge plans and fetching previews.

    list:     names of the preview files currently in the scratch folder.
    retrieve: download a single preview PDF by its filename.
    create:   validate a split/merge plan and execute it.
    """

    permission_classes = (IsAuthenticated,)
    serializer_class = DocumentSplitMergePlanSerializer
    parser_classes = (parsers.JSONParser,)

    def __init__(self, **kwargs):
        super(DocumentSplitMergeViewSet, self).__init__(**kwargs)
        # All split/merge output lives in a dedicated scratch subfolder.
        self.tempdir = os.path.join(settings.SCRATCH_DIR, "paperless-split-merge")
        os.makedirs(self.tempdir, exist_ok=True)

    def get_queryset(self):
        # The "queryset" is simply the files present in the scratch folder.
        return os.listdir(self.tempdir)

    def retrieve(self, request, pk, *args, **kwargs):
        # sanitize_filename prevents path traversal via the pk segment.
        filename = os.path.join(
            self.tempdir, pathvalidate.sanitize_filename(pk))
        if not os.path.isfile(filename):
            raise Http404()

        with open(filename, "rb") as f:
            # HttpResponse reads the file eagerly, so closing it when the
            # with-block exits is safe.
            return HttpResponse(f, content_type="application/pdf")

    def list(self, request, *args, **kwargs):
        return Response(self.get_queryset())

    def create(self, request, *args, **kwargs):
        serializer = self.get_serializer(data=request.data)
        serializer.is_valid(raise_exception=True)

        # Imported lazily to avoid a circular import at module load time.
        from .merge import execute_split_merge_plan, MergeError

        split_merge_plan = serializer.validated_data.get("split_merge_plan")
        preview = serializer.validated_data.get("preview")
        delete_source = serializer.validated_data.get("delete_source")
        metadata = serializer.validated_data.get("metadata")
        try:
            pdf_files = execute_split_merge_plan(
                plan=split_merge_plan,
                preview=preview,
                delete_source=delete_source,
                metadata=metadata,
                tempdir=self.tempdir
            )
        except MergeError as e:
            # A broken plan is a client error, not a server error: report
            # it as HTTP 400 instead of letting it bubble up as a 500.
            return Response(str(e), status=400)

        return Response(pdf_files)
15 changes: 8 additions & 7 deletions src/paperless/urls.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,18 +25,23 @@
BulkEditView,
SelectionDataView,
BulkDownloadView,
DocumentMergeView
DocumentSplitMergeViewSet
)
from paperless.views import FaviconView

# Single API router; each route is registered exactly once (the rendered
# diff showed stale duplicate registrations for "documents" and "logs").
api_router = DefaultRouter()

api_router.register(r"correspondents", CorrespondentViewSet)
api_router.register(r"document_types", DocumentTypeViewSet)
api_router.register(r"documents", DocumentViewSet)
api_router.register(r"logs", LogViewSet, basename="logs")
api_router.register(r"saved_views", SavedViewViewSet)
api_router.register(r"split_merge", DocumentSplitMergeViewSet, basename="split_merge")
api_router.register(r"tags", TagViewSet)


urlpatterns = [
re_path(r"^api/", include([
Expand Down Expand Up @@ -68,10 +73,6 @@
re_path(r"^documents/bulk_download/", BulkDownloadView.as_view(),
name="bulk_download"),

re_path(r"^documents/merge/",
DocumentMergeView.as_view(),
name="merge"),

path('token/', views.obtain_auth_token)

] + api_router.urls)),
Expand Down

0 comments on commit 1e131f0

Please sign in to comment.