Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Extractor pdf 2 image #11909

Open
wants to merge 16 commits into
base: main
Choose a base branch
from
2 changes: 2 additions & 0 deletions api/core/workflow/nodes/document_extractor/entities.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
from collections.abc import Sequence
from typing import Optional

from core.workflow.nodes.base import BaseNodeData


class DocumentExtractorNodeData(BaseNodeData):
    """Configuration payload for the document extractor workflow node."""

    # Selector the node passes to the variable pool to fetch its input variable.
    variable_selector: Sequence[str]
    # When truthy, the node additionally runs image extraction and fills the
    # "images" output; defaults to False (text only).
    output_image: Optional[bool] = False
69 changes: 65 additions & 4 deletions api/core/workflow/nodes/document_extractor/node.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,21 +5,24 @@
import os
import tempfile
from typing import cast
from uuid import uuid4

import docx
import pandas as pd
import pypdfium2 # type: ignore
import yaml # type: ignore

from configs import dify_config
from core.file import File, FileTransferMethod, file_manager
from core.file import File, FileTransferMethod, FileType, file_manager
from core.helper import ssrf_proxy
from core.variables import ArrayFileSegment
from core.variables.segments import FileSegment
from core.workflow.entities.node_entities import NodeRunResult
from core.workflow.nodes.base import BaseNode
from core.workflow.nodes.enums import NodeType
from libs.login import current_user
from models.workflow import WorkflowNodeExecutionStatus
from services.file_service import FileService

from .entities import DocumentExtractorNodeData
from .exc import DocumentExtractorError, FileDownloadError, TextExtractionError, UnsupportedFileTypeError
Expand All @@ -38,6 +41,8 @@ class DocumentExtractorNode(BaseNode[DocumentExtractorNodeData]):

def _run(self):
variable_selector = self.node_data.variable_selector
output_image = self.node_data.output_image

variable = self.graph_runtime_state.variable_pool.get(variable_selector)

if variable is None:
Expand All @@ -50,23 +55,27 @@ def _run(self):
value = variable.value
inputs = {"variable_selector": variable_selector}
process_data = {"documents": value if isinstance(value, list) else [value]}

images = []
try:
if isinstance(value, list):
if output_image:
images = _extract_images_from_file(files=value)
extracted_text_list = list(map(_extract_text_from_file, value))
return NodeRunResult(
status=WorkflowNodeExecutionStatus.SUCCEEDED,
inputs=inputs,
process_data=process_data,
outputs={"text": extracted_text_list},
outputs={"text": extracted_text_list, "images": images},
)
elif isinstance(value, File):
if output_image:
images = _extract_images_from_file([value])
extracted_text = _extract_text_from_file(value)
return NodeRunResult(
status=WorkflowNodeExecutionStatus.SUCCEEDED,
inputs=inputs,
process_data=process_data,
outputs={"text": extracted_text},
outputs={"text": extracted_text, "images": images},
)
else:
raise DocumentExtractorError(f"Unsupported variable type: {type(value)}")
Expand Down Expand Up @@ -180,6 +189,58 @@ def _extract_text_from_pdf(file_content: bytes) -> str:
raise TextExtractionError(f"Failed to extract text from PDF: {str(e)}") from e


def _extract_images_from_pdf(file: File, *, scale: float = 5) -> list[File]:
    """
    Render every page of a PDF to a PNG image and store each image as an uploaded file.

    Args:
        file: The PDF file whose pages should be rendered.
        scale: Render scale handed to pypdfium2's ``page.render``. The default of 5
            keeps the previous hard-coded value; lower it to reduce memory usage.

    Returns:
        One ``File`` (type IMAGE, transfer method LOCAL_FILE) per page, in page order.

    Raises:
        TextExtractionError: If rendering, encoding or uploading any page fails.
    """
    file_content = _download_file_content(file)
    images: list[File] = []
    pdf_file = None
    try:
        pdf_file = pypdfium2.PdfDocument(file_content, autoclose=True)
        for page in pdf_file:
            try:
                page_bitmap = page.render(scale=scale)
                try:
                    png_buffer = io.BytesIO()
                    # The PIL image may share memory with the bitmap, so the PNG is
                    # fully encoded before the bitmap is released.
                    page_bitmap.to_pil().save(png_buffer, format="PNG")
                finally:
                    page_bitmap.close()
            finally:
                # Release the native page handle promptly; rendered pages are large.
                page.close()

            # NOTE(review): current_user is request-bound; confirm it resolves in every
            # context where this workflow node can run.
            image_upload_file = FileService.upload_file(
                content=png_buffer.getvalue(),
                user=current_user,
                mimetype="image/png",
                filename=f"{uuid4()}.png",
            )
            images.append(
                File(
                    tenant_id=image_upload_file.tenant_id,
                    type=FileType.IMAGE,
                    transfer_method=FileTransferMethod.LOCAL_FILE,
                    remote_url=image_upload_file.source_url,
                    related_id=image_upload_file.id,
                    filename=image_upload_file.name,
                    extension=image_upload_file.extension,
                    mime_type=image_upload_file.mime_type,
                    size=image_upload_file.size,
                    storage_key=image_upload_file.key,
                )
            )

        return images
    except Exception as e:
        # Use the module's own error type and keep the original traceback chained.
        raise TextExtractionError(f"Failed to convert PDF to images: {e}") from e
    finally:
        if pdf_file is not None:
            pdf_file.close()


def _extract_images_from_file(files: list[File]) -> list[File]:
    """
    Collect page images for every PDF found in ``files``.

    Non-PDF files contribute no images. A file is treated as a PDF when its
    extension is ``.pdf`` (case-insensitive) or, if it has no extension, when its
    MIME type is ``application/pdf``.

    Args:
        files: The files to inspect.

    Returns:
        The page images of all PDF files, in input order; empty if there are none.

    Raises:
        TextExtractionError: If a file has neither extension nor MIME type, or if
            image extraction fails for any file.
    """
    images: list[File] = []
    try:
        for file in files:
            if file.extension:
                is_pdf = file.extension.lower() == ".pdf"
            elif file.mime_type:
                is_pdf = file.mime_type == "application/pdf"
            else:
                raise UnsupportedFileTypeError("Unable to determine file type: MIME type or file extension is missing")

            # Accumulate instead of returning, so every file in the list is
            # processed rather than only the first one.
            if is_pdf:
                images.extend(_extract_images_from_pdf(file=file))
        return images
    except Exception as e:
        raise TextExtractionError(f"Failed to extract image from PDF: {str(e)}") from e


def _extract_text_from_doc(file_content: bytes) -> str:
"""
Extract text from a DOC/DOCX file.
Expand Down
40 changes: 40 additions & 0 deletions web/app/components/base/radio-group/index.tsx
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
'use client'
import type { FC } from 'react'
import React from 'react'
import s from './style.module.css'
import cn from '@/utils/classnames'

// A single selectable entry of the radio group.
type OPTION = {
// Text rendered next to the radio indicator.
label: string
// Value compared against the selected value and passed to onChange on click.
value: any
}

// Props accepted by RadioGroup.
type Props = {
// Extra class names merged onto the flex container.
className?: string
// Entries to render, in order.
options: OPTION[]
// Currently selected value; the option whose value is strictly equal is shown as checked.
value: any
// Called with the clicked option's value.
onChange: (value: any) => void
}

/**
 * Horizontal group of mutually exclusive options.
 *
 * The option whose `value` strictly equals the `value` prop is rendered as checked.
 * Clicking an option, or pressing Enter/Space while it is focused, calls `onChange`
 * with that option's value. ARIA roles are set so assistive technology announces
 * the group and the checked state.
 */
const RadioGroup: FC<Props> = ({
  className = '',
  options,
  value,
  onChange,
}) => {
  return (
    <div className={cn(className, 'flex')} role='radiogroup'>
      {options.map((item) => {
        const checked = item.value === value
        return (
          <div
            key={item.value}
            role='radio'
            aria-checked={checked}
            tabIndex={0}
            className={cn(s.item, checked && s.checked)}
            onClick={() => onChange(item.value)}
            onKeyDown={(e) => {
              // Mirror native radio behaviour for keyboard users.
              if (e.key === 'Enter' || e.key === ' ') {
                e.preventDefault()
                onChange(item.value)
              }
            }}
          >
            <div className={s.radio}></div>
            <div className='text-[13px] font-medium text-gray-900'>{item.label}</div>
          </div>
        )
      })}
    </div>
  )
}
export default React.memo(RadioGroup)
24 changes: 24 additions & 0 deletions web/app/components/base/radio-group/style.module.css
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
/* Base look of one option: a flexible-width, pill-like clickable row. */
.item {
@apply grow flex items-center h-8 px-2.5 rounded-lg bg-gray-25 border border-gray-100 cursor-pointer space-x-2;
}

/* Hover feedback: white background, light blue border and a raised shadow. */
.item:hover {
background-color: #ffffff;
border-color: #B2CCFF;
box-shadow: 0px 12px 16px -4px rgba(16, 24, 40, 0.08), 0px 4px 6px -2px rgba(16, 24, 40, 0.03);
}

/* Selected option: stronger blue border with a subtler shadow than hover. */
.item.checked {
background-color: #ffffff;
border-color: #528BFF;
box-shadow: 0px 1px 2px 0px rgba(16, 24, 40, 0.06), 0px 1px 3px 0px rgba(16, 24, 40, 0.10);
}

/* Radio indicator: a 16px circle with a thin gray border when unselected. */
.radio {
@apply w-4 h-4 border-[2px] border-gray-200 rounded-full;
}

/* Selected indicator: the border is thickened and coloured blue. */
.item.checked .radio {
border-width: 5px;
border-color: #155eef;
}
Original file line number Diff line number Diff line change
Expand Up @@ -268,6 +268,10 @@ const formatItem = (
variable: 'text',
type: (data as DocExtractorNodeType).is_array_file ? VarType.arrayString : VarType.string,
},
{
variable: 'images',
type: VarType.arrayFile,
},
]
break
}
Expand Down
44 changes: 38 additions & 6 deletions web/app/components/workflow/nodes/document-extractor/panel.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ import Field from '@/app/components/workflow/nodes/_base/components/field'
import { BlockEnum, type NodePanelProps } from '@/app/components/workflow/types'
import I18n from '@/context/i18n'
import { LanguagesSupported } from '@/i18n/language'
import RadioGroup from '@/app/components/base/radio-group'

const i18nPrefix = 'workflow.nodes.docExtractor'

Expand Down Expand Up @@ -44,12 +45,13 @@ const Panel: FC<NodePanelProps<DocExtractorNodeType>> = ({
const {
readOnly,
inputs,
handleConfigChanges,
handleVarChanges,
filterVar,
} = useConfig(id, data)

return (
<div className='mt-2'>
<div className='pt-2'>
<div className='px-4 pb-4 space-y-4'>
<Field
title={t(`${i18nPrefix}.inputVar`)}
Expand All @@ -70,15 +72,45 @@ const Panel: FC<NodePanelProps<DocExtractorNodeType>> = ({
</div>
</>
</Field>
<Field
title={t(`${i18nPrefix}.output_format`)}
>
<>
<RadioGroup
className='space-x-3'
options={[
{
label: t(`${i18nPrefix}.output_text`),
value: 'text',
},
{
label: t(`${i18nPrefix}.output_image`),
value: 'image',
},
]}
value={inputs.output_image ? 'image' : 'text'}
onChange={val => handleConfigChanges({
output_image: val === 'image',
})}
/>
</>
</Field>
</div>
<Split />
<div>
<OutputVars>
<VarItem
name='text'
type={inputs.is_array_file ? 'array[string]' : 'string'}
description={t(`${i18nPrefix}.outputVars.text`)}
/>
<div className=' p-4'>
<VarItem
name='text'
type={inputs.is_array_file ? 'array[string]' : 'string'}
description={t(`${i18nPrefix}.outputVars.text`)}
/>
<VarItem
name='images'
type={'array[file]'}
description={t(`${i18nPrefix}.outputVars.images`)}
/>
</div>
</OutputVars>
</div>
</div>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,5 @@ import type { CommonNodeType, ValueSelector } from '@/app/components/workflow/ty
// Node data of the document extractor workflow node.
export type DocExtractorNodeType = CommonNodeType & {
// Selector of the input variable.
variable_selector: ValueSelector
// Decides whether the `text` output is typed as array[string] or string.
is_array_file: boolean
// True when the panel's output format is set to "image" instead of "text".
output_image: boolean
}
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ import { useCallback, useMemo } from 'react'
import produce from 'immer'
import { useStoreApi } from 'reactflow'

import type { ValueSelector, Var } from '../../types'
import type { DocumentExtractorConfig, ValueSelector, Var } from '../../types'
import { VarType } from '../../types'
import { type DocExtractorNodeType } from './types'
import useNodeCrud from '@/app/components/workflow/nodes/_base/hooks/use-node-crud'
Expand Down Expand Up @@ -55,11 +55,19 @@ const useConfig = (id: string, payload: DocExtractorNodeType) => {
setInputs(newInputs)
}, [getType, inputs, setInputs])

// Copy the output-mode flag from the panel's config onto the node inputs.
const handleConfigChanges = useCallback(({ output_image }: DocumentExtractorConfig) => {
  setInputs(
    produce(inputs, (draft) => {
      draft.output_image = output_image
    }),
  )
}, [inputs, setInputs])

return {
readOnly,
inputs,
filterVar,
handleVarChanges,
handleConfigChanges,
}
}

Expand Down
3 changes: 3 additions & 0 deletions web/app/components/workflow/types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -396,3 +396,6 @@ export type VisionSetting = {
variable_selector: ValueSelector
detail: Resolution
}
// Config payload accepted by the document extractor's handleConfigChanges callback.
export type DocumentExtractorConfig = {
// Whether the node should output images instead of text only.
output_image: boolean
}
4 changes: 4 additions & 0 deletions web/i18n/en-US/workflow.ts
Original file line number Diff line number Diff line change
Expand Up @@ -671,8 +671,12 @@ const translation = {
},
docExtractor: {
inputVar: 'Input Variable',
output_image: 'Image output (PDF only)',
output_text: 'Text output',
output_format: 'Output Format',
outputVars: {
text: 'Extracted text',
images: 'Extracted images',
},
supportFileTypes: 'Support file types: {{types}}.',
learnMore: 'Learn more',
Expand Down
4 changes: 4 additions & 0 deletions web/i18n/zh-Hans/workflow.ts
Original file line number Diff line number Diff line change
Expand Up @@ -671,8 +671,12 @@ const translation = {
},
docExtractor: {
inputVar: '输入变量',
output_image: '图片输出(只支持PDF)',
output_text: '文本输出',
output_format: '输出类型',
outputVars: {
text: '提取的文本',
images: '提取的图片列表',
},
supportFileTypes: '支持的文件类型: {{types}}。',
learnMore: '了解更多',
Expand Down
6 changes: 5 additions & 1 deletion web/i18n/zh-Hant/workflow.ts
Original file line number Diff line number Diff line change
Expand Up @@ -664,8 +664,12 @@ const translation = {
addNote: '添加註釋',
},
docExtractor: {
output_image: '圖片輸出(只支持PDF)',
output_text: '文本輸出',
output_format: '輸出類型',
outputVars: {
text: '提取的文字',
text: '提取的文本',
images: '提取的圖片列表',
},
learnMore: '瞭解更多資訊',
inputVar: '輸入變數',
Expand Down
Loading