Merge pull request #127 from acon96/release/v0.2.13
Release v0.2.13
acon96 authored Apr 25, 2024
2 parents d61b9b9 + d3f0ebd commit d6b1aa0
Showing 63 changed files with 9,970 additions and 1,089 deletions.
40 changes: 33 additions & 7 deletions .github/workflows/create-release.yml
@@ -13,14 +13,32 @@ permissions:

jobs:
build_wheels:
name: Build wheels on ${{ matrix.arch }} (HA ${{ matrix.home_assistant_version }})
name: Build wheels on ${{ matrix.arch }}${{ matrix.suffix }} (HA ${{ matrix.home_assistant_version }})
runs-on: ubuntu-latest
strategy:
fail-fast: false
matrix:
home_assistant_version: ["2023.12.4", "2024.2.1"]
arch: ["aarch64", "armhf", "amd64", "i386"]

arch: [aarch64, armhf, amd64, i386]
suffix: [""]
include:
- home_assistant_version: "2024.2.1"
arch: "amd64"
suffix: "-noavx"
extra_defines: "-DLLAMA_NATIVE=OFF -DLLAMA_AVX=OFF -DLLAMA_AVX2=OFF -DLLAMA_FMA=OFF"
- home_assistant_version: "2024.2.1"
arch: "amd64"
suffix: "-avx512"
extra_defines: "-DLLAMA_NATIVE=OFF -DLLAMA_AVX=OFF -DLLAMA_AVX2=OFF -DLLAMA_AVX512=ON"
- home_assistant_version: "2024.2.1"
arch: "i386"
suffix: "-noavx"
extra_defines: "-DLLAMA_NATIVE=OFF -DLLAMA_AVX=OFF -DLLAMA_AVX2=OFF -DLLAMA_FMA=OFF"
- home_assistant_version: "2024.2.1"
arch: "i386"
suffix: "-avx512"
extra_defines: "-DLLAMA_NATIVE=OFF -DLLAMA_AVX=OFF -DLLAMA_AVX2=OFF -DLLAMA_AVX512=ON"

steps:
- name: Checkout code
uses: actions/checkout@v4
@@ -56,16 +74,24 @@ jobs:
cd /tmp
git clone --quiet --recurse-submodules https://github.com/abetlen/llama-cpp-python --branch "v${{ env.EMBEDDED_LLAMA_CPP_PYTHON_VERSION }}"
cd llama-cpp-python
export CMAKE_ARGS="-DLLAVA_BUILD=OFF"
export CMAKE_ARGS="-DLLAVA_BUILD=OFF ${{ matrix.extra_defines }}"
python3 -m build --wheel
cp -f ./dist/*.whl /artifacts/
ls -la ./dist/
for filename in ./dist/*.whl; do
output_file=$(basename $filename .whl)${{ matrix.suffix }}.whl
echo "$filename -> $output_file"
mv "$filename" "/artifacts/${output_file}";
done;
ls -la /artifacts/
- name: Upload artifacts
uses: actions/upload-artifact@v4
with:
path: ./artifacts/*.whl
name: artifact_${{ matrix.arch }}_${{ matrix.home_assistant_version }}
name: artifact_${{ matrix.arch }}${{ matrix.suffix }}_${{ matrix.home_assistant_version }}

release:
name: Create Release
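For orientation, the new `-noavx` amd64 variant amounts to building llama-cpp-python with the AVX/FMA CMake defines turned off. Below is a minimal local sketch assembled from the same commands and defines used in the workflow steps above (the version variable and `/artifacts` output path follow the workflow; running this locally also assumes the Python `build` package is installed):

```console
cd /tmp
git clone --quiet --recurse-submodules https://github.com/abetlen/llama-cpp-python --branch "v${EMBEDDED_LLAMA_CPP_PYTHON_VERSION}"
cd llama-cpp-python
# Same defines as the "-noavx" matrix entry, appended to the base CMAKE_ARGS
export CMAKE_ARGS="-DLLAVA_BUILD=OFF -DLLAMA_NATIVE=OFF -DLLAMA_AVX=OFF -DLLAMA_AVX2=OFF -DLLAMA_FMA=OFF"
python3 -m build --wheel
# Rename the wheel with the variant suffix so it can coexist with the other builds
for filename in ./dist/*.whl; do
    mv "$filename" "/artifacts/$(basename "$filename" .whl)-noavx.whl"
done
```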
1 change: 1 addition & 0 deletions .gitignore
@@ -10,3 +10,4 @@ main.log
.venv
*.xlsx
notes.txt
runpod_bootstrap.sh
78 changes: 37 additions & 41 deletions README.md
@@ -7,29 +7,28 @@ This project provides the required "glue" components to control your Home Assist
Please see the [Setup Guide](./docs/Setup.md) for more information on installation.

## LLama Conversation Integration
In order to integrate with Home Assistant, we provide a `custom_component` that exposes the locally running LLM as a "conversation agent".
In order to integrate with Home Assistant, we provide a custom component that exposes the locally running LLM as a "conversation agent".

This component can be interacted with in a few ways:
- using a chat interface so you can chat with it.
- integrating with Speech-to-Text and Text-to-Speech addons so you can just speak to it.

The component can either run the model directly as part of the Home Assistant software using llama-cpp-python, or you can run the [oobabooga/text-generation-webui](https://github.com/oobabooga/text-generation-webui) project to provide access to the LLM via an API interface.

When doing this, you can host the model yourself and point the add-on at machine where the model is hosted, or you can run the model using text-generation-webui using the provided [custom Home Assistant add-on](./addon).
The component can either run the model directly as part of the Home Assistant software using llama-cpp-python, or you can run [Ollama](https://ollama.com/) (simple) or the [oobabooga/text-generation-webui](https://github.com/oobabooga/text-generation-webui) project (advanced) to provide access to the LLM via an API interface.
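To illustrate the API-backend path, a minimal request against an Ollama server looks like the sketch below (the host, port, and model name are placeholders, not values defined by this project):

```console
curl http://localhost:11434/api/generate -d '{
  "model": "llama3",
  "prompt": "Turn off the office light.",
  "stream": false
}'
```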

## Home LLM Model
The "Home" models are a fine tuning of the Phi model series from Microsoft and the StableLM model series from StabilityAI. The model is able to control devices in the user's house as well as perform basic question and answering. The fine tuning dataset is a [custom synthetic dataset](./data) designed to teach the model function calling based on the device information in the context.
The "Home" models are a fine tuning of various Large Languages Models that are under 5B parameters. The models are able to control devices in the user's house as well as perform basic question and answering. The fine tuning dataset is a [custom synthetic dataset](./data) designed to teach the model function calling based on the device information in the context.

The latest models can be found on HuggingFace:
3B v3 (Based on StableLM-Zephyr-3B): https://huggingface.co/acon96/Home-3B-v3-GGUF (Zephyr prompt format)
1B v2 (Based on Phi-1.5): https://huggingface.co/acon96/Home-1B-v2-GGUF (ChatML prompt format)
1B v3 (Based on TinyLlama-1.1B): https://huggingface.co/acon96/Home-1B-v3-GGUF (Zephyr prompt format)

<details>

<summary>Old Models</summary>

3B v2 (Based on Phi-2): https://huggingface.co/acon96/Home-3B-v2-GGUF (ChatML prompt format)
3B v1 (Based on Phi-2): https://huggingface.co/acon96/Home-3B-v1-GGUF (ChatML prompt format)
1B v2 (Based on Phi-1.5): https://huggingface.co/acon96/Home-1B-v2-GGUF (ChatML prompt format)
1B v1 (Based on Phi-1.5): https://huggingface.co/acon96/Home-1B-v1-GGUF (ChatML prompt format)

</details>
@@ -41,6 +40,7 @@ The model can be used as an "instruct" type model using the [ChatML](https://git
Example "system" prompt:
```
You are 'Al', a helpful AI Assistant that controls the devices in a house. Complete the following task as instructed with the information provided only.
The current time and date is 08:12 AM on Thursday March 14, 2024
Services: light.turn_off(), light.turn_on(brightness,rgb_color), fan.turn_on(), fan.turn_off()
Devices:
light.office 'Office Light' = on;80%
@@ -80,45 +80,40 @@ The dataset is available on HuggingFace: https://huggingface.co/datasets/acon96/
The source for the dataset is in the [data](/data) of this repository.

### Training
The 3B model was trained as a LoRA on an RTX 3090 (24GB) using the following settings for the custom training script. The embedding weights were "saved" and trained normally along with the rank matricies in order to train the newly added tokens to the embeddings. The full model is merged together at the end. Training took approximately 10 hours.
The 3B model was trained as a full fine-tuning on 2x RTX 4090 (48GB). Training took approximately 28 hours. It was trained on the `--large` dataset variant.

<details>
<summary>Training Arguments</summary>

```console
python3 train.py \
accelerate launch --config_file fsdp_config.yaml train.py \
--run_name home-3b \
--base_model microsoft/phi-2 \
--add_pad_token \
--add_chatml_tokens \
--base_model stabilityai/stablelm-zephyr-3b \
--bf16 \
--train_dataset data/home_assistant_alpaca_merged_train.json \
--learning_rate 1e-5 \
--save_steps 1000 \
--micro_batch_size 2 --gradient_checkpointing \
--train_dataset data/home_assistant_train.jsonl \
--learning_rate 1e-5 --batch_size 64 --epochs 1 \
--micro_batch_size 2 --gradient_checkpointing --group_by_length \
--ctx_size 2048 \
--group_by_length \
--use_lora --lora_rank 32 --lora_alpha 64 --lora_modules fc1,fc2,q_proj,v_proj,dense --lora_modules_to_save embed_tokens,lm_head --lora_merge
--save_steps 50 --save_total_limit 10 --eval_steps 100 --logging_steps 2
```

</details>

The 1B model was trained as a full fine-tuning on on an RTX 3090 (24GB). Training took approximately 2.5 hours.
The 1B model was trained as a full fine-tuning on an RTX 3090 (24GB). Training took approximately 2 hours. It was trained on the `--medium` dataset variant.

<details>
<summary>Training Arguments</summary>

```console
python3 train.py \
--run_name home-1b \
--base_model microsoft/phi-1_5 \
--add_pad_token \
--add_chatml_tokens \
--base_model TinyLlama/TinyLlama-1.1B-Chat-v1.0 \
--bf16 \
--train_dataset data/home_assistant_train.json \
--learning_rate 1e-5 \
--micro_batch_size 4 --gradient_checkpointing \
--ctx_size 2048
--train_dataset data/home_assistant_train.jsonl \
--test_dataset data/home_assistant_test.jsonl \
--learning_rate 2e-5 --batch_size 32 \
--micro_batch_size 8 --gradient_checkpointing --group_by_length \
--ctx_size 2048 --save_steps 100 --save_total_limit 10
```

</details>
@@ -129,19 +124,20 @@ In order to facilitate running the project entirely on the system where Home Ass


## Version History
| Version | Description |
| ------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| v0.2.12 | Fix cover ICL examples, allow setting number of ICL examples, add min P and typical P sampler options, recommend models during setup, add JSON mode for Ollama backend, fix missing default options |
| v0.2.11 | Add prompt caching, expose llama.cpp runtime settings, build llama-cpp-python wheels using GitHub actions, and install wheels directly from GitHub |
| v0.2.10 | Allow configuring the model parameters during initial setup, attempt to auto-detect defaults for recommended models, Fix to allow lights to be set to max brightness |
| v0.2.9 | Fix HuggingFace Download, Fix llama.cpp wheel installation, Fix light color changing, Add in-context-learning support |
| v0.2.8 | Fix ollama model names with colons |
| v0.2.7 | Publish model v3, Multiple Ollama backend improvements, Updates for HA 2024.02, support for voice assistant aliases |
| v0.2.6 | Bug fixes, add options for limiting chat history, HTTPS endpoint support, added zephyr prompt format. |
| v0.2.5 | Fix Ollama max tokens parameter, fix GGUF download from Hugging Face, update included llama-cpp-python to 0.2.32, and add parameters to function calling for dataset + component, & model update |
| v0.2.4 | Fix API key auth on model load for text-generation-webui, and add support for Ollama API backend |
| v0.2.3 | Fix API key auth, Support chat completion endpoint, and refactor to make it easier to add more remote backends |
| v0.2.2 | Fix options window after upgrade, fix training script for new Phi model format, and release new models |
| v0.2.1 | Properly expose generation parameters for each backend, handle config entry updates without reloading, support remote backends with an API key |
| v0.2 | Bug fixes, support more backends, support for climate + switch devices, JSON style function calling with parameters, GBNF grammars |
| v0.1 | Initial Release |
| Version | Description |
|---------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| v0.2.13 | Add support for Llama 3, build llama.cpp wheels that are compatible with non-AVX systems, fix an error with exposing script entities, fix multiple small Ollama backend issues, and add basic multi-language support |
| v0.2.12 | Fix cover ICL examples, allow setting number of ICL examples, add min P and typical P sampler options, recommend models during setup, add JSON mode for Ollama backend, fix missing default options |
| v0.2.11 | Add prompt caching, expose llama.cpp runtime settings, build llama-cpp-python wheels using GitHub actions, and install wheels directly from GitHub |
| v0.2.10 | Allow configuring the model parameters during initial setup, attempt to auto-detect defaults for recommended models, Fix to allow lights to be set to max brightness |
| v0.2.9 | Fix HuggingFace Download, Fix llama.cpp wheel installation, Fix light color changing, Add in-context-learning support |
| v0.2.8 | Fix ollama model names with colons |
| v0.2.7 | Publish model v3, Multiple Ollama backend improvements, Updates for HA 2024.02, support for voice assistant aliases |
| v0.2.6 | Bug fixes, add options for limiting chat history, HTTPS endpoint support, added zephyr prompt format. |
| v0.2.5 | Fix Ollama max tokens parameter, fix GGUF download from Hugging Face, update included llama-cpp-python to 0.2.32, and add parameters to function calling for dataset + component, & model update |
| v0.2.4 | Fix API key auth on model load for text-generation-webui, and add support for Ollama API backend |
| v0.2.3 | Fix API key auth, Support chat completion endpoint, and refactor to make it easier to add more remote backends |
| v0.2.2 | Fix options window after upgrade, fix training script for new Phi model format, and release new models |
| v0.2.1 | Properly expose generation parameters for each backend, handle config entry updates without reloading, support remote backends with an API key |
| v0.2 | Bug fixes, support more backends, support for climate + switch devices, JSON style function calling with parameters, GBNF grammars |
| v0.1 | Initial Release |
2 changes: 1 addition & 1 deletion addon/Dockerfile
@@ -23,7 +23,7 @@ RUN \
python3-venv \
python3-pip \
\
&& git clone https://github.com/oobabooga/text-generation-webui.git ${APP_DIR} --branch snapshot-2024-04-07 \
&& git clone https://github.com/oobabooga/text-generation-webui.git ${APP_DIR} --branch snapshot-2024-04-14 \
&& python3 -m pip install torch torchvision torchaudio py-cpuinfo==9.0.0 \
&& python3 -m pip install -r ${APP_DIR}/requirements_cpu_only_noavx2.txt llama-cpp-python \
&& apt-get purge -y --auto-remove \
2 changes: 1 addition & 1 deletion addon/config.yaml
@@ -1,6 +1,6 @@
---
name: oobabooga-text-generation-webui
version: 2024.04.09
version: 2024.04.14
slug: text-generation-webui
description: "A tool for running Large Language Models"
url: "https://github.com/oobabooga/text-generation-webui"
11 changes: 8 additions & 3 deletions custom_components/llama_conversation/agent.py
@@ -489,6 +489,11 @@ def expose_attributes(attributes):
all_services = []
all_service_names = []
for domain in domains:
# scripts show up as individual services
if domain == "script":
all_services.extend(["script.reload()", "script.turn_on()", "script.turn_off()", "script.toggle()"])
continue

for name, service in service_dict.get(domain, {}).items():
args = flatten_vol_schema(service.schema)
args_to_expose = set(args).intersection(allowed_service_call_arguments)
@@ -856,7 +861,7 @@ def _extract_response(self, response_json: dict) -> str:
if choices[0]["finish_reason"] != "stop":
_LOGGER.warning("Model response did not end on a stop token (unfinished sentence)")

if response_json["object"] == "chat.completion":
if response_json["object"] in ["chat.completion", "chat.completion.chunk"]:
return choices[0]["message"]["content"]
else:
return choices[0]["text"]
@@ -1075,12 +1080,12 @@ def _completion_params(self, conversation: dict) -> (str, dict):

endpoint = "/api/generate"
request_params["prompt"] = self._format_prompt(conversation)
# request_params["raw"] = True # ignore prompt template
request_params["raw"] = True # ignore prompt template

return endpoint, request_params

def _extract_response(self, response_json: dict) -> str:
if response_json["done"] != "true":
if response_json["done"] not in ["true", True]:
_LOGGER.warning("Model response did not end on a stop token (unfinished sentence)")

# TODO: this doesn't work because ollama caches prompts and doesn't always return the full prompt length
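For reference, the two Ollama changes above line up with the request/response shape of Ollama's `/api/generate` endpoint: with `raw` enabled Ollama applies no prompt template (the integration supplies its own fully formatted prompt), and the non-streaming response signals completion with a boolean `done` field. A minimal sketch (host and model name are placeholders):

```console
curl http://localhost:11434/api/generate -d '{
  "model": "home-llm",
  "prompt": "Turn off the office light.",
  "raw": true,
  "stream": false
}'
# Abridged response: {"response": "...", "done": true}
```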