langgenius · ic-xu · Dec 20, 2024 · Dec 20, 2024 · Dec 20, 2024 · Dec 20, 2024
diff --git a/api/core/workflow/nodes/document_extractor/entities.py b/api/core/workflow/nodes/document_extractor/entities.py
@@ -1,7 +1,9 @@
 from collections.abc import Sequence
+from typing import Optional
 
 from core.workflow.nodes.base import BaseNodeData
 
 
 class DocumentExtractorNodeData(BaseNodeData):
     variable_selector: Sequence[str]
+    output_image: Optional[bool] = False
diff --git a/api/core/workflow/nodes/document_extractor/node.py b/api/core/workflow/nodes/document_extractor/node.py
@@ -5,21 +5,24 @@
 import os
 import tempfile
 from typing import cast
+from uuid import uuid4
 
 import docx
 import pandas as pd
 import pypdfium2  # type: ignore
 import yaml  # type: ignore
 
 from configs import dify_config
-from core.file import File, FileTransferMethod, file_manager
+from core.file import File, FileTransferMethod, FileType, file_manager
 from core.helper import ssrf_proxy
 from core.variables import ArrayFileSegment
 from core.variables.segments import FileSegment
 from core.workflow.entities.node_entities import NodeRunResult
 from core.workflow.nodes.base import BaseNode
 from core.workflow.nodes.enums import NodeType
+from libs.login import current_user
 from models.workflow import WorkflowNodeExecutionStatus
+from services.file_service import FileService
 
 from .entities import DocumentExtractorNodeData
 from .exc import DocumentExtractorError, FileDownloadError, TextExtractionError, UnsupportedFileTypeError
@@ -38,6 +41,8 @@ class DocumentExtractorNode(BaseNode[DocumentExtractorNodeData]):
 
     def _run(self):
         variable_selector = self.node_data.variable_selector
+        output_image = self.node_data.output_image
+
         variable = self.graph_runtime_state.variable_pool.get(variable_selector)
 
         if variable is None:
@@ -50,23 +55,27 @@ def _run(self):
         value = variable.value
         inputs = {"variable_selector": variable_selector}
         process_data = {"documents": value if isinstance(value, list) else [value]}
-
+        images = []
         try:
             if isinstance(value, list):
+                if output_image:
+                    images = _extract_images_from_file(files=value)
                 extracted_text_list = list(map(_extract_text_from_file, value))
                 return NodeRunResult(
                     status=WorkflowNodeExecutionStatus.SUCCEEDED,
                     inputs=inputs,
                     process_data=process_data,
-                    outputs={"text": extracted_text_list},
+                    outputs={"text": extracted_text_list, "images": images},
                 )
             elif isinstance(value, File):
+                if output_image:
+                    images = _extract_images_from_file([value])
                 extracted_text = _extract_text_from_file(value)
                 return NodeRunResult(
                     status=WorkflowNodeExecutionStatus.SUCCEEDED,
                     inputs=inputs,
                     process_data=process_data,
-                    outputs={"text": extracted_text},
+                    outputs={"text": extracted_text, "images": images},
                 )
             else:
                 raise DocumentExtractorError(f"Unsupported variable type: {type(value)}")
@@ -180,6 +189,58 @@ def _extract_text_from_pdf(file_content: bytes) -> str:
         raise TextExtractionError(f"Failed to extract text from PDF: {str(e)}") from e
 
 
+def _extract_images_from_pdf(file: File) -> list[File]:
+    """
+    Render every page of a PDF to a PNG image and upload each one.
+
+    The file content is downloaded, each page is rasterized with pypdfium2,
+    encoded as PNG and stored through ``FileService.upload_file``. One image
+    ``File`` is built per page from the resulting upload record.
+
+    :param file: The PDF file to convert.
+    :return: One ``File`` of type ``FileType.IMAGE`` per page, in page order.
+    :raises TextExtractionError: If rendering, encoding or uploading fails.
+    """
+    file_content = _download_file_content(file)
+    images: list[File] = []
+    try:
+        pdf_file = pypdfium2.PdfDocument(file_content, autoclose=True)
+        for page in pdf_file:
+            # scale is pixels per PDF canvas unit (usually 1/72 in), so 5 is
+            # roughly 360 DPI: sharp output, but large bitmaps per page.
+            page_bitmap = page.render(scale=5)
+            image = page_bitmap.to_pil()
+            byte_io = io.BytesIO()
+            image.save(byte_io, format="PNG")
+            img_bytes = byte_io.getvalue()
+            # NOTE(review): current_user comes from libs.login; confirm it is
+            # populated when the workflow runs outside a web request.
+            image_upload_file = FileService.upload_file(
+                content=img_bytes, user=current_user, mimetype="image/png", filename=f"{uuid4()}.png"
+            )
+            images.append(
+                File(
+                    tenant_id=image_upload_file.tenant_id,
+                    type=FileType.IMAGE,
+                    transfer_method=FileTransferMethod.LOCAL_FILE,
+                    remote_url=image_upload_file.source_url,
+                    related_id=image_upload_file.id,
+                    filename=image_upload_file.name,
+                    extension=image_upload_file.extension,
+                    mime_type=image_upload_file.mime_type,
+                    size=image_upload_file.size,
+                    storage_key=image_upload_file.key,
+                )
+            )
+
+        return images
+    except Exception as e:
+        # Use the module's own error type and chain the cause, matching
+        # _extract_text_from_pdf, so the original traceback is preserved.
+        raise TextExtractionError(f"Failed to convert PDF to images: {e}") from e
+
+
+def _extract_images_from_file(files: list[File]) -> list[File]:
+    """
+    Collect page images for every PDF found in ``files``.
+
+    A file counts as a PDF when its extension is ``.pdf`` or, if it has no
+    extension, when its MIME type is ``application/pdf``. Non-PDF files
+    contribute no images. All files are processed, not just the first one.
+
+    :param files: Files to inspect.
+    :return: Images from all PDF inputs, concatenated in input order.
+    :raises TextExtractionError: If a file has neither extension nor MIME
+        type, or if converting any PDF fails.
+    """
+    images: list[File] = []
+    try:
+        for file in files:
+            if file.extension:
+                is_pdf = file.extension == ".pdf"
+            elif file.mime_type:
+                is_pdf = file.mime_type == "application/pdf"
+            else:
+                raise UnsupportedFileTypeError("Unable to determine file type: MIME type or file extension is missing")
+            if is_pdf:
+                # Accumulate instead of returning so later files are handled.
+                images.extend(_extract_images_from_pdf(file=file))
+    except Exception as e:
+        raise TextExtractionError(f"Failed to extract image from PDF: {str(e)}") from e
+    return images
+
+
 def _extract_text_from_doc(file_content: bytes) -> str:
     """
     Extract text from a DOC/DOCX file.

diff --git a/web/app/components/base/radio-group/index.tsx b/web/app/components/base/radio-group/index.tsx
@@ -0,0 +1,40 @@
+'use client'
+import type { FC } from 'react'
+import React from 'react'
+import s from './style.module.css'
+import cn from '@/utils/classnames'
+
+type OPTION = {
+  label: string
+  value: any
+}
+
+type Props = {
+  className?: string
+  options: OPTION[]
+  value: any
+  onChange: (value: any) => void
+}
+
+/**
+ * Horizontal single-choice selector rendered as a row of radio-style cards.
+ *
+ * Each option is a `radio` inside a `radiogroup`; it can be chosen with a
+ * click or, when focused, with Space or Enter. `onChange` receives the
+ * selected option's `value`.
+ */
+const RadioGroup: FC<Props> = ({
+  className = '',
+  options,
+  value,
+  onChange,
+}) => {
+  return (
+    <div className={cn(className, 'flex')} role='radiogroup'>
+      {options.map((item) => {
+        const checked = item.value === value
+        return (
+          <div
+            key={item.value}
+            role='radio'
+            aria-checked={checked}
+            tabIndex={0}
+            className={cn(s.item, checked && s.checked)}
+            onClick={() => onChange(item.value)}
+            onKeyDown={(e) => {
+              // Let keyboard users select the focused option.
+              if (e.key === ' ' || e.key === 'Enter') {
+                e.preventDefault()
+                onChange(item.value)
+              }
+            }}
+          >
+            <div className={s.radio}></div>
+            <div className='text-[13px] font-medium text-gray-900'>{item.label}</div>
+          </div>
+        )
+      })}
+    </div>
+  )
+}
+export default React.memo(RadioGroup)
diff --git a/web/app/components/base/radio-group/style.module.css b/web/app/components/base/radio-group/style.module.css
@@ -0,0 +1,24 @@
+.item {
+  @apply grow flex items-center h-8 px-2.5 rounded-lg bg-gray-25 border border-gray-100 cursor-pointer space-x-2;
+}
+
+.item:hover {
+  background-color: #ffffff;
+  border-color: #B2CCFF;
+  box-shadow: 0px 12px 16px -4px rgba(16, 24, 40, 0.08), 0px 4px 6px -2px rgba(16, 24, 40, 0.03);
+}
+
+.item.checked {
+  background-color: #ffffff;
+  border-color: #528BFF;
+  box-shadow: 0px 1px 2px 0px rgba(16, 24, 40, 0.06), 0px 1px 3px 0px rgba(16, 24, 40, 0.10);
+}
+
+.radio {
+  @apply w-4 h-4 border-[2px] border-gray-200 rounded-full;
+}
+
+.item.checked .radio {
+  border-width: 5px;
+  border-color: #155eef;
+}
diff --git a/web/app/components/workflow/nodes/_base/components/variable/utils.ts b/web/app/components/workflow/nodes/_base/components/variable/utils.ts
@@ -268,6 +268,10 @@ const formatItem = (
           variable: 'text',
           type: (data as DocExtractorNodeType).is_array_file ? VarType.arrayString : VarType.string,
         },
+        {
+          variable: 'images',
+          type: VarType.arrayFile,
+        },
       ]
       break
     }

diff --git a/web/app/components/workflow/nodes/document-extractor/panel.tsx b/web/app/components/workflow/nodes/document-extractor/panel.tsx
@@ -14,6 +14,7 @@ import Field from '@/app/components/workflow/nodes/_base/components/field'
 import { BlockEnum, type NodePanelProps } from '@/app/components/workflow/types'
 import I18n from '@/context/i18n'
 import { LanguagesSupported } from '@/i18n/language'
+import RadioGroup from '@/app/components/base/radio-group'
 
 const i18nPrefix = 'workflow.nodes.docExtractor'
 
@@ -44,12 +45,13 @@ const Panel: FC<NodePanelProps<DocExtractorNodeType>> = ({
   const {
     readOnly,
     inputs,
+    handleConfigChanges,
     handleVarChanges,
     filterVar,
   } = useConfig(id, data)
 
   return (
-    <div className='mt-2'>
+    <div className='pt-2'>
       <div className='px-4 pb-4 space-y-4'>
         <Field
           title={t(`${i18nPrefix}.inputVar`)}
@@ -70,15 +72,45 @@ const Panel: FC<NodePanelProps<DocExtractorNodeType>> = ({
             </div>
           </>
         </Field>
+        <Field
+          title={t(`${i18nPrefix}.output_format`)}
+        >
+          <>
+            <RadioGroup
+              className='space-x-3'
+              options={[
+                {
+                  label: t(`${i18nPrefix}.output_text`),
+                  value: 'text',
+                },
+                {
+                  label: t(`${i18nPrefix}.output_image`),
+                  value: 'image',
+                },
+              ]}
+              value={inputs.output_image ? 'image' : 'text'}
+              onChange={val => handleConfigChanges({
+                output_image: val === 'image',
+              })}
+            />
+          </>
+        </Field>
       </div>
       <Split />
       <div>
         <OutputVars>
-          <VarItem
-            name='text'
-            type={inputs.is_array_file ? 'array[string]' : 'string'}
-            description={t(`${i18nPrefix}.outputVars.text`)}
-          />
+          <div className=' p-4'>
+            <VarItem
+              name='text'
+              type={inputs.is_array_file ? 'array[string]' : 'string'}
+              description={t(`${i18nPrefix}.outputVars.text`)}
+            />
+            <VarItem
+              name='images'
+              type={'array[file]'}
+              description={t(`${i18nPrefix}.outputVars.images`)}
+            />
+          </div>
         </OutputVars>
       </div>
     </div>

diff --git a/web/app/components/workflow/nodes/document-extractor/types.ts b/web/app/components/workflow/nodes/document-extractor/types.ts
@@ -3,4 +3,5 @@ import type { CommonNodeType, ValueSelector } from '@/app/components/workflow/ty
 export type DocExtractorNodeType = CommonNodeType & {
   variable_selector: ValueSelector
   is_array_file: boolean
+  output_image: boolean
 }
diff --git a/web/app/components/workflow/nodes/document-extractor/use-config.ts b/web/app/components/workflow/nodes/document-extractor/use-config.ts
@@ -2,7 +2,7 @@ import { useCallback, useMemo } from 'react'
 import produce from 'immer'
 import { useStoreApi } from 'reactflow'
 
-import type { ValueSelector, Var } from '../../types'
+import type { DocumentExtractorConfig, ValueSelector, Var } from '../../types'
 import { VarType } from '../../types'
 import { type DocExtractorNodeType } from './types'
 import useNodeCrud from '@/app/components/workflow/nodes/_base/hooks/use-node-crud'
@@ -55,11 +55,19 @@ const useConfig = (id: string, payload: DocExtractorNodeType) => {
     setInputs(newInputs)
   }, [getType, inputs, setInputs])
 
+  const handleConfigChanges = useCallback((config: DocumentExtractorConfig) => {
+    const newInputs = produce(inputs, (draft) => {
+      draft.output_image = config.output_image
+    })
+    setInputs(newInputs)
+  }, [inputs, setInputs])
+
   return {
     readOnly,
     inputs,
     filterVar,
     handleVarChanges,
+    handleConfigChanges,
   }
 }
 

diff --git a/web/app/components/workflow/types.ts b/web/app/components/workflow/types.ts
@@ -396,3 +396,6 @@ export type VisionSetting = {
   variable_selector: ValueSelector
   detail: Resolution
 }
+export type DocumentExtractorConfig = {
+  output_image: boolean
+}
diff --git a/web/i18n/en-US/workflow.ts b/web/i18n/en-US/workflow.ts
@@ -671,8 +671,12 @@ const translation = {
     },
     docExtractor: {
       inputVar: 'Input Variable',
+      output_image: 'Image output (PDF only)',
+      output_text: 'Text output',
+      output_format: 'Output format',
       outputVars: {
         text: 'Extracted text',
+        images: 'Extracted images',
       },
       supportFileTypes: 'Support file types: {{types}}.',
       learnMore: 'Learn more',

diff --git a/web/i18n/zh-Hans/workflow.ts b/web/i18n/zh-Hans/workflow.ts
@@ -671,8 +671,12 @@ const translation = {
     },
     docExtractor: {
       inputVar: '输入变量',
+      output_image: '图片输出（只支持PDF）',
+      output_text: '文本输出',
+      output_format: '输出类型',
       outputVars: {
         text: '提取的文本',
+        images: '提取的图片列表',
       },
       supportFileTypes: '支持的文件类型: {{types}}。',
       learnMore: '了解更多',

diff --git a/web/i18n/zh-Hant/workflow.ts b/web/i18n/zh-Hant/workflow.ts
@@ -664,8 +664,12 @@ const translation = {
       addNote: '添加註釋',
     },
     docExtractor: {
+      output_image: '圖片輸出（只支持PDF）',
+      output_text: '文本輸出',
+      output_format: '輸出類型',
       outputVars: {
-        text: '提取的文字',
+        text: '提取的文本',
+        images: '提取的圖片列表',
       },
       learnMore: '瞭解更多資訊',
       inputVar: '輸入變數',