Merge branch 'mod-for-making-package-with-pyinstaller'
ryogrid committed Oct 17, 2024
2 parents 7813aee + 6401d32 commit fb2519e
Showing 16 changed files with 907 additions and 745 deletions.
11 changes: 2 additions & 9 deletions .github/workflows/python-app.yml
@@ -2,16 +2,9 @@
# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python

name: Python application

on:
  push:
    branches: [ "main" ]
  pull_request:
    branches: [ "main" ]

on: [push, pull_request]
permissions:
  contents: read

jobs:
  build:
    runs-on: ubuntu-latest
@@ -29,4 +22,4 @@ jobs:
      - name: Mypy Check
        uses: jpetrucciani/mypy-check@master
        with:
          path: .
          path: .
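
A side note on the trigger change (an inference, not stated in the commit): on: [push, pull_request] broadens CI from main-only to every branch, so work on branches like mod-for-making-package-with-pyinstaller also gets the mypy check.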
5 changes: 5 additions & 0 deletions .streamlit/config.toml
@@ -0,0 +1,5 @@
[global]
developmentMode = false

[server]
port = 8501
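
Note (an inference from the code, not stated in the commit): developmentMode = false mirrors the --global.developmentMode=false flag passed in run_webui.py below; Streamlit refuses to honor a fixed server.port while global.developmentMode is true, so packaged runs need it off.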
16 changes: 9 additions & 7 deletions README.md
@@ -19,7 +19,7 @@
- Python 3.10.4
- pip 22.0.4
- $ pip install -r requirements.txt
- $ python make-tags-with-wd-tagger.py --dir "IMAGE FILES CONTAINED DIR PATH"
- $ python tagging.py --dir "IMAGE FILES CONTAINED DIR PATH"
- The script searches the directory structure recursively :)
- This takes quite a while...
- About 0.5 sec/file on a mid-spec desktop PC (GPU not used)
@@ -28,18 +28,20 @@
- Please see [here](https://onnxruntime.ai/docs/execution-providers/)
- The key to performance is the processing speed of ONNX Runtime on your machine :)
- Image file paths and their tags are saved to tags-wd-tagger.txt
- $ python count-unique-tag-num.py
- $ python counttag.py
- => for deciding an appropriate dimension scale that fits your data
- $ python gen-lsi-model.py
- **Please edit the [num_topics parameter](https://github.com/ryogrid/local-illust-image-searcher/blob/main/gen-lsi-model.py#L51) before execution**
- I think about 80% of the unique tag count from count-unique-tag-num.py is better
- EX: unique tags count is 1000 -> 0.8 * 1000 -> 800 num_topics (dimension)
- the unique tag count is shown
- $ python genmodel.py --dim MODEL_DIMENSION
- MODEL_DIMENSION is an integer that specifies the dimension of the latent semantic representation
- i.e., the dimension after applying LSI
- I think about 80% of the unique tag count from counttag.py is better
- EX: unique tags count is 1000 -> 0.8 * 1000 -> 800 (dimension)
- This takes quite a while...
- LSI processing: dimension reduction with [Singular Value Decomposition (SVD)](https://en.wikipedia.org/wiki/Singular_value_decomposition)
- Takes only a few seconds for the 1000-file, 800-to-700-dimension reduction case (the demo case in a later section)
- But in the 340k-file, 7500-to-6000-dimension case, about 3.5 hours are needed
- those files are not the demo ones :)
- $ streamlit run web-ui-image-search-lsi.py
- $ streamlit run webui.py
- The search app opens in your web browser
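- Putting it together, a minimal end-to-end run (hypothetical image directory; --dim 800 follows the ~80% guideline above for a 1000-unique-tag case)
  - $ python tagging.py --dir "C:\images"
  - $ python counttag.py
  - $ python genmodel.py --dim 800
  - $ streamlit run webui.py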

## Tips (Attention)
27 changes: 27 additions & 0 deletions cmd_run.py
@@ -0,0 +1,27 @@
import argparse
import sys

import tagging
import genmodel
import counttag

def main() -> None:
    parser: argparse.ArgumentParser = argparse.ArgumentParser()
    parser.add_argument('command', nargs=1, help='command to run')
    # dummy: declared only so argparse accepts the flag; the subcommand re-parses it
    parser.add_argument('--dir', nargs=1, help='')
    # dummy
    parser.add_argument('--dim', nargs=1, type=int, help='')
    args: argparse.Namespace = parser.parse_args()

    if args.command[0] == 'tagging':
        tagging.main(sys.argv[2:])
    elif args.command[0] == 'genmodel':
        genmodel.main(sys.argv[2:])
    elif args.command[0] == 'counttag':
        counttag.main()
    else:
        print('Invalid command')
        sys.exit(1)

main()
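
A hedged usage sketch for the packaged build (the executable name cmd_run.exe is PyInstaller's default for this script, assumed here; the command word is argv[1] and everything after it is forwarded to the chosen module):

cmd_run.exe tagging --dir "IMAGE FILES CONTAINED DIR PATH"
cmd_run.exe counttag
cmd_run.exe genmodel --dim 800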
12 changes: 0 additions & 12 deletions count-unique-tag-num.py

This file was deleted.

16 changes: 16 additions & 0 deletions counttag.py
@@ -0,0 +1,16 @@
# -*- coding: utf-8 -*-

from typing import Dict, List

def main() -> None:
    tag_map: Dict[str, bool] = {}
    with open('tags-wd-tagger.txt', 'r', encoding='utf-8') as f:
        for line in f:
            tags: List[str] = line.strip().split(',')
            # drop the leading file path element and the trailing field
            tags = tags[1:-1]
            for tag in tags:
                tag_map[tag] = True
    print(f'{len(tag_map)} unique tags found')

if __name__ == '__main__':
    main()
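
For reference, a hypothetical tags-wd-tagger.txt line (the exact on-disk format is an assumption; the tags[1:-1] slice drops the leading file path and the final field, e.g. the empty string left by a trailing comma):

img/0001.png,1girl,solo,smile,

A file of such lines would print "3 unique tags found" here.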
144 changes: 74 additions & 70 deletions gen-lsi-model.py → genmodel.py
@@ -1,70 +1,74 @@
from gensim import corpora
from gensim.models import LsiModel
from gensim.similarities import MatrixSimilarity
from gensim.utils import simple_preprocess
import pickle
from typing import List, Tuple
import logging

# generate corpus for gensim and index text file for search tool
def read_documents_and_gen_idx_text(file_path: str) -> List[List[str]]:
    corpus_base: List[List[str]] = []
    idx_text_fpath: str = file_path.split('.')[0] + '_lsi_idx.csv'
    with open(idx_text_fpath, 'w', encoding='utf-8') as idx_f:
        with open(file_path, 'r', encoding='utf-8') as f:
            for line in f:
                row: List[str] = line.split(",")
                # remove file path element
                row = row[1:]
                # # remove last element
                # row = row[:-1]

                # join tags with space for gensim
                tags_line: str = ' '.join(row)
                # tokens: List[str] = simple_preprocess(tags_line.strip())
                tokens: List[str] = row
                # ignore simple_preprocess failure case and short tags image
                if tokens and len(tokens) >= 3:
                    corpus_base.append(tokens)
                    idx_f.write(line)
                    idx_f.flush()
    return corpus_base

# read image file paths from file
def read_documents(filename: str) -> List[str]:
    with open(filename, 'r', encoding='utf-8') as file:
        documents: List[str] = [line.strip() for line in file.readlines()]
    return documents

def main() -> None:
    format_str = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
    logging.basicConfig(
        format=format_str,
        level=logging.DEBUG
    )

    processed_docs: List[List[str]] = read_documents_and_gen_idx_text('tags-wd-tagger.txt')

    # image file => doc_id
    dictionary: corpora.Dictionary = corpora.Dictionary(processed_docs)
    # remove frequent tags
    #dictionary.filter_n_most_frequent(500)

    with open('lsi_dictionary', 'wb') as f:
        pickle.dump(dictionary, f)

    corpus: List[List[Tuple[int, int]]] = [dictionary.doc2bow(doc) for doc in processed_docs]

    # gen LSI model with specified number of topics (dimensions)
    # ATTENTION: num_topics should be set to appropriate value!!!
    lsi_model: LsiModel = LsiModel(corpus, id2word=dictionary, num_topics=800)

    lsi_model.save("lsi_model")

    # make similarity index
    index: MatrixSimilarity = MatrixSimilarity(lsi_model[corpus])

    index.save("lsi_index")

if __name__ == "__main__":
    main()
import argparse
import sys

from gensim import corpora
from gensim.models import LsiModel
from gensim.similarities import MatrixSimilarity
import pickle
from typing import List, Tuple
import logging

# generate corpus for gensim and index text file for search tool
def read_documents_and_gen_idx_text(file_path: str) -> List[List[str]]:
    corpus_base: List[List[str]] = []
    idx_text_fpath: str = file_path.split('.')[0] + '_lsi_idx.csv'
    with open(idx_text_fpath, 'w', encoding='utf-8') as idx_f:
        with open(file_path, 'r', encoding='utf-8') as f:
            for line in f:
                row: List[str] = line.split(",")
                # remove file path element
                row = row[1:]

                # tokens: List[str] = simple_preprocess(tags_line.strip())
                tokens: List[str] = row
                # ignore simple_preprocess failure case and short tags image
                if tokens and len(tokens) >= 3:
                    corpus_base.append(tokens)
                    idx_f.write(line)
                    idx_f.flush()

    return corpus_base

# read image file paths from file
def read_documents(filename: str) -> List[str]:
    with open(filename, 'r', encoding='utf-8') as file:
        documents: List[str] = [line.strip() for line in file.readlines()]
    return documents

def main(arg_str: list[str]) -> None:
    format_str = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
    logging.basicConfig(
        format=format_str,
        level=logging.DEBUG
    )

    parser: argparse.ArgumentParser = argparse.ArgumentParser()
    parser.add_argument('--dim', nargs=1, type=int, required=True, help='number of dimensions of the LSI model')
    args: argparse.Namespace = parser.parse_args(arg_str)

    processed_docs: List[List[str]] = read_documents_and_gen_idx_text('tags-wd-tagger.txt')

    # image file => doc_id
    dictionary: corpora.Dictionary = corpora.Dictionary(processed_docs)
    # remove frequent tags
    #dictionary.filter_n_most_frequent(500)

    with open('lsi_dictionary', 'wb') as f:
        pickle.dump(dictionary, f)

    corpus: List[List[Tuple[int, int]]] = [dictionary.doc2bow(doc) for doc in processed_docs]

    # gen LSI model with specified number of topics (dimensions)
    # lsi_model: LsiModel = LsiModel(corpus, id2word=dictionary, num_topics=800)
    lsi_model: LsiModel = LsiModel(corpus, id2word=dictionary, num_topics=args.dim[0])

    lsi_model.save("lsi_model")

    # make similarity index
    index: MatrixSimilarity = MatrixSimilarity(lsi_model[corpus])

    index.save("lsi_index")

if __name__ == "__main__":
    # drop the program name: parse_args() expects only the option strings
    main(sys.argv[1:])
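
For context, a minimal sketch (not part of this commit) of how the artifacts saved above could be queried; the file names come from the code, while the query tags are hypothetical:

import pickle

from gensim.models import LsiModel
from gensim.similarities import MatrixSimilarity

# load the dictionary, LSI model, and similarity index written by genmodel.py
with open('lsi_dictionary', 'rb') as f:
    dictionary = pickle.load(f)
lsi_model = LsiModel.load('lsi_model')
index = MatrixSimilarity.load('lsi_index')

# project a tag query into LSI space and rank all indexed images by similarity
query_bow = dictionary.doc2bow(['1girl', 'smile'])
sims = sorted(enumerate(index[lsi_model[query_bow]]), key=lambda pair: -pair[1])
print(sims[:10])  # (doc_id, score) pairs; doc_id maps to a line of tags-wd-tagger_lsi_idx.csv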
2 changes: 2 additions & 0 deletions hooks/hook-streamlit.py
@@ -0,0 +1,2 @@
from PyInstaller.utils.hooks import copy_metadata
datas = copy_metadata('streamlit')
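
Background (my understanding of PyInstaller hooks, not stated in the commit): Streamlit reads its own distribution metadata at startup, which PyInstaller does not bundle by default; copy_metadata('streamlit') returns those metadata files as a datas list so the frozen app can find them.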
7 changes: 2 additions & 5 deletions packaging.bat
@@ -20,11 +20,8 @@ IF NOT EXIST "%output_dir%" (
MKDIR "%output_dir%"
)

REM For each Python script in the input directory
FOR %%f IN ("%input_dir%\*.py") DO (
    REM Package the script into an executable without the onefile option
    pyinstaller --distpath "%output_dir%" --workpath "%output_dir%\build" --specpath "%output_dir%\spec" --noconfirm "%%f"
)
REM Package the script into an executable without the onefile option
pyinstaller --distpath "%output_dir%" --workpath "%output_dir%\build" --specpath "%output_dir%\spec" --noconfirm "%input_dir%\cmd_run.py"

REM Clean up build and spec files
RMDIR /S /Q "%output_dir%\build"
12 changes: 12 additions & 0 deletions packaging_webui_step1.bat
@@ -0,0 +1,12 @@
@echo off
REM Batch file to package Python scripts into Windows executables
REM Usage: packaging_webui_step1.bat

REM Package the script into an executable without the onefile option
pyinstaller --additional-hooks-dir=./hooks --noconfirm run_webui.py --clean

REM Clean up build and spec files
REM RMDIR /S /Q build
REM RMDIR /S /Q spec

ECHO Packaging complete.
12 changes: 12 additions & 0 deletions packaging_webui_step2.bat
@@ -0,0 +1,12 @@
@echo off
REM Batch file to package Python scripts into Windows executables
REM Usage: packaging_webui_step2.bat

REM Package the script into an executable without the onefile option
pyinstaller --noconfirm run_webui.spec --clean

REM Clean up build and spec files
REM RMDIR /S /Q build
REM RMDIR /S /Q __pycache__

ECHO Packaging complete.
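
Why the build is split in two is not stated in the commit; a plausible reading is that step 1 generates run_webui.spec with the Streamlit hook applied, and step 2 rebuilds from that spec, leaving room to hand-edit the spec (e.g., to bundle webui.py) between the runs.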
Binary file added requirements_with_packager.txt
Binary file not shown.
11 changes: 11 additions & 0 deletions run_webui.py
@@ -0,0 +1,11 @@
import streamlit.web.cli as stcli

[GitHub Actions / build annotations on line 1 of run_webui.py — mypy: Cannot find implementation or library stub for modules "streamlit", "streamlit.web", and "streamlit.web.cli" [import-not-found]]

import os
import sys

def streamlit_run() -> None:
    # webui.py is expected to sit next to the frozen executable
    src = os.path.dirname(sys.executable) + '/webui.py'
    sys.argv = ['streamlit', 'run', src, '--global.developmentMode=false']
    sys.exit(stcli.main())

if __name__ == "__main__":
    streamlit_run()
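
Note: since src is resolved from os.path.dirname(sys.executable), the packaged app expects webui.py to be placed next to the frozen executable (e.g., copied into PyInstaller's dist folder — an assumption about the intended layout); the --global.developmentMode=false flag mirrors .streamlit/config.toml above.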