From 1fcb7d8e083c9077bcb6b7159fc4bfb2efd88726 Mon Sep 17 00:00:00 2001 From: rathan-muralidhar Date: Thu, 14 Dec 2023 14:51:45 +0530 Subject: [PATCH] Revert "Merge pull request #852 from project-anuvaad/develop" This reverts commit 845297e22c45fc809957eeb74e0b02a0da469578, reversing changes made to 4a0660c72fc9355a7d1e68d6036196f576a86928. --- .../ocr/tesseract_ulca_v2/README.md | 58 +++++-------------- .../ocr/tesseract_ulca_v2/start.sh | 22 +++---- 2 files changed, 25 insertions(+), 55 deletions(-) diff --git a/anuvaad-etl/anuvaad-extractor/document-processor/ocr/tesseract_ulca_v2/README.md b/anuvaad-etl/anuvaad-extractor/document-processor/ocr/tesseract_ulca_v2/README.md index 4a80a8880..a6c463df5 100644 --- a/anuvaad-etl/anuvaad-extractor/document-processor/ocr/tesseract_ulca_v2/README.md +++ b/anuvaad-etl/anuvaad-extractor/document-processor/ocr/tesseract_ulca_v2/README.md @@ -1,59 +1,29 @@ -# Anuvaad OCR -Open source OCR models for Indic Languages (Printed), developed and used as part of project Anuvaad. -Repo contains tesseract service with REST interface, which is ULCA compliant: +A tesseract service with rest interface: input : image url ouput : [sentences] Hindi and Tamil use custom weights -detection of language and downloads tess-best weights if not already avilable +detection of language and downloading tess-best weights if not already avilable -**Sample curl** : +sample curl : - - - curl --location 'http://localhost:5000/anuvaad/ocr/v0/ulca-ocr' \ - --header 'Content-Type: application/json' \ - --data '{ - "image" : [ - { - "imageUri": "https://anuvaad-raw-datasets.s3-us-west-2.amazonaws.com/anuvaad_ocr_hindi.jpg" - } - ], - "config": { - "languages": [{ - "sourceLanguage" : "hi" - }] - } - }' -' - -**Sample Response:** -```json -{ - "output" : [ - { - "source" : "बिपिन रावत का एक माचिस की डिबिया के कारण हुआ था" +curl --location --request POST 'http://0.0.0.0:5000/anuvaad/ocr/v0/ulca-ocr' \ +--header 'Content-Type: application/json' \ +--data-raw '{ + "config": { + "language": { + "sourceLanguage": "en" } - ], - "status" : { - "statusCode" : 200 , - "message" : "success" + }, + "imageUri": ["https://anuvaad-raw-datasets.s3-us-west-2.amazonaws.com/anuvaad_ocr_english.jpg" + + ] } -} - -``` -**Deployment** -## **Deployment** - - -```shell +' -docker build -t anuvaad_ocr_ulca_v2 . -docker run --name anuvaad_ocr_ulca_v2 -d --network host anuvaad_ocr_ulca_v2 -``` diff --git a/anuvaad-etl/anuvaad-extractor/document-processor/ocr/tesseract_ulca_v2/start.sh b/anuvaad-etl/anuvaad-extractor/document-processor/ocr/tesseract_ulca_v2/start.sh index d3a61c666..209fe89b5 100644 --- a/anuvaad-etl/anuvaad-extractor/document-processor/ocr/tesseract_ulca_v2/start.sh +++ b/anuvaad-etl/anuvaad-extractor/document-processor/ocr/tesseract_ulca_v2/start.sh @@ -13,32 +13,32 @@ curl -L -o /usr/share/tesseract-ocr/4.00/tessdata/Gujarati.traineddata https://g curl -L -o /usr/share/tesseract-ocr/4.00/tessdata/Oriya.traineddata https://github.com/tesseract-ocr/tessdata_best/blob/main/script/Oriya.traineddata?raw=true tam_modelpath='/usr/share/tesseract-ocr/4.00/tessdata/anuvaad_tam.traineddata' -#url_tam='https://anuvaad-pubnet-weights.s3.amazonaws.com/anuvaad_tam.traineddata' -url_tam='https://anuvaad-pubnet-weights.s3.amazonaws.com/anuvad_tam_scene_text_real.traineddata' +#url_tam='https://anuvaad-pubnet-weights.s3.amazonaws.com/anuvaad_tam.traineddata?AWSAccessKeyId=AKIAXX2AMEIRJY2GNYVZ&Signature=X6%2BwKdeOyOUFlOFs%2B7eRmzhziZ0%3D&Expires=1693557258' +url_tam='https://anuvaad-pubnet-weights.s3.amazonaws.com/anuvad_tam_scene_text_real.traineddata?AWSAccessKeyId=AKIAXX2AMEIRJY2GNYVZ&Signature=J1NEp22bhsW7dO3kd8iN1VX7XtI%3D&Expires=1711538482' hin_modelpath='/usr/share/tesseract-ocr/4.00/tessdata/anuvaad_hin.traineddata' hin_scene_modelpath='/usr/share/tesseract-ocr/4.00/tessdata/anuvad_hin_scene_text_real.traineddata' -url_hin='https://anuvaad-pubnet-weights.s3.amazonaws.com/anuvaad_hin.traineddata' -url_hin_scene='https://anuvaad-pubnet-weights.s3.amazonaws.com/anuvad_hin_scene_text_real.traineddata' +url_hin='https://anuvaad-pubnet-weights.s3.amazonaws.com/anuvaad_hin.traineddata?AWSAccessKeyId=AKIAXX2AMEIRJY2GNYVZ&Signature=2l%2F0OwWQrD%2FIvogfijATPufjMLA%3D&Expires=1693557740' +url_hin_scene='https://anuvaad-pubnet-weights.s3.amazonaws.com/anuvad_hin_scene_text_real.traineddata?AWSAccessKeyId=AKIAXX2AMEIRJY2GNYVZ&Signature=FZ6Whiiv8uTYDkPGUvMzqoOKPOI%3D&Expires=1709212126' kan_modelpath='/usr/share/tesseract-ocr/4.00/tessdata/anuvaad_kan.traineddata' -url_kan='https://anuvaad-pubnet-weights.s3.amazonaws.com/anuvaad_kan.traineddata' +url_kan='https://anuvaad-pubnet-weights.s3.amazonaws.com/anuvaad_kan.traineddata?AWSAccessKeyId=AKIAXX2AMEIRJY2GNYVZ&Signature=gDiNsqrV0n2%2BWZSMwesyqkLOYZ8%3D&Expires=1694149503' ben_modelpath='/usr/share/tesseract-ocr/4.00/tessdata/anuvaad_ben.traineddata' -url_ben='https://anuvaad-pubnet-weights.s3.amazonaws.com/anuvaad_ben.traineddata' +url_ben='https://anuvaad-pubnet-weights.s3.amazonaws.com/anuvaad_ben.traineddata?AWSAccessKeyId=AKIAXX2AMEIRJY2GNYVZ&Signature=ku%2FdynTtJVvaf55dwYC%2FMt3pKqo%3D&Expires=1698743313' mal_modelpath='/usr/share/tesseract-ocr/4.00/tessdata/anuvaad_mal.traineddata' -url_mal='https://anuvaad-pubnet-weights.s3.amazonaws.com/anuvaad_mal.traineddata' +url_mal='https://anuvaad-pubnet-weights.s3.amazonaws.com/anuvaad_mal.traineddata?AWSAccessKeyId=AKIAXX2AMEIRJY2GNYVZ&Signature=hX%2Bo%2BTTvwoN7IBcX%2FIgFTwMHoGs%3D&Expires=1698743610' mar_modelpath='/usr/share/tesseract-ocr/4.00/tessdata/anuvaad_mar.traineddata' -url_mar='https://anuvaad-pubnet-weights.s3.amazonaws.com/anuvaad_mar.traineddata' +url_mar='https://anuvaad-pubnet-weights.s3.amazonaws.com/anuvaad_mar.traineddata?AWSAccessKeyId=AKIAXX2AMEIRJY2GNYVZ&Signature=aTu5Ps9hL90clfPMZIVOEPx5%2Fl0%3D&Expires=1698743699' ori_modelpath='/usr/share/tesseract-ocr/4.00/tessdata/anuvaad_ori.traineddata' -url_ori='https://anuvaad-pubnet-weights.s3.amazonaws.com/anuvaad_ori.traineddata' +url_ori='https://anuvaad-pubnet-weights.s3.amazonaws.com/anuvaad_ori.traineddata?AWSAccessKeyId=AKIAXX2AMEIRJY2GNYVZ&Signature=5aqEjjOryEhE4ElV2i8oHgVY%2F7I%3D&Expires=1698743792' scene_text_line_detection_modelpath='./src/utilities/primalinenet/scene_text_judgement_line_detection_v1_model.pth' -url_scene_text_line_detection_modelpath='https://anuvaad-pubnet-weights.s3.amazonaws.com/scene_text_judgement_line_detection_v1_model.pth' +url_scene_text_line_detection_modelpath='https://anuvaad-pubnet-weights.s3.amazonaws.com/scene_text_judgement_line_detection_v1_model.pth?AWSAccessKeyId=AKIAXX2AMEIRJY2GNYVZ&Signature=zTv5bP4Pt6NoLN%2FLUC7JrLBBrxs%3D&Expires=1705824951' scene_text_east_angle_detection_modelpath='./src/utilities/east/east-model.ckpt-49491.data-00000-of-00001' -url_scene_text_east_angle_detection_modelpath='https://anuvaad-pubnet-weights.s3.amazonaws.com/east-model.ckpt-49491.data-00000-of-00001' +url_scene_text_east_angle_detection_modelpath='https://anuvaad-pubnet-weights.s3.amazonaws.com/east-model.ckpt-49491.data-00000-of-00001?AWSAccessKeyId=AKIAXX2AMEIRJY2GNYVZ&Signature=XbR8OnEhYISllPYYuYkzFhmovUY%3D&Expires=1707278033'