
Merge pull request #3145 from alibaba/feature/sync
MNN:Sync: sync internal 3.0.3
jxt1234 authored Dec 31, 2024
2 parents 20a2d95 + 42b37ac commit d6e10c2
Showing 74 changed files with 18,339 additions and 23,808 deletions.
11 changes: 11 additions & 0 deletions docs/faq.md
@@ -258,6 +258,17 @@ OpenCL / Vulkan register their backends with the MNN main library through static-variable self-registration.
### Notes on Register-related memory leaks
When checking with the valgrind tool, memory leaks related to MNN Register will be reported. This is one-time initialization memory that does not grow afterwards, and can be treated as a false positive.
### Notes on Metal-related memory growth
The Metal backend uses Objective-C objects, so their memory must be reclaimed through Objective-C's autorelease mechanism. Wrap the relevant MNN API calls in an @autoreleasepool block in your code so the memory is released automatically:
```
@autoreleasepool {
/* MNN API calls */
}

```
## Performance
### When using the GPU, calling copyToHostTensor / readMap is very slow
29 changes: 26 additions & 3 deletions docs/tools/compress.md
@@ -28,7 +28,7 @@ The MNN model compression tools provide model compression algorithms including low-rank decomposition, pruning, and quantization
| Training quantization | Converts float convolutions to int8 convolution computation; requires training; can improve the quantized model's accuracy; cuts storage to a quarter of the original model; reduces memory use and speeds up computation (some models may run slower than the float model, because float and int8 use different optimizations) | LSQ, OAQ, WAQ |
| Direct weight quantization | Quantizes only the model's weights; they are restored to float for computation, so only model storage shrinks; compute speed is the same as float; can be done in one step during model conversion; with 8-bit quantization, accuracy is essentially unchanged and model size shrinks to 1/4 of the original | Symmetric quantization, asymmetric quantization |
| Training weight quantization | Same characteristics as direct weight quantization, but implemented through the mnncompress compression-algorithm plugin, so it can offer lower-bit weight quantization to save more storage and improve the accuracy of the weight-quantized model; e.g. with 4-bit quantization, model size shrinks to 1/8 of the original | Symmetric quantization |
| FP16 | Converts FP32 computation to FP16 computation; can be done in one step during model conversion; model size shrinks to 1/2 of the original; essentially no accuracy loss | - |
| FP16 | Converts FP32 weights to the FP16 type; can be done in one step during model conversion; model size shrinks to 1/2 of the original; essentially no accuracy loss | - |

### How do I use it?
1. The compression features in the model conversion tool require no extra data; just add the corresponding flags when converting the model. With dynamic quantization enabled, compute-heavy operators such as convolution can also be accelerated with quantized kernels.
@@ -64,19 +64,42 @@ The MNN model compression tools provide model compression algorithms including low-rank decomposition, pruning, and quantization
--weightQuantBits 8 [--weightQuantAsymmetric] (optional) [--weightQuantBlock 128] (optional)
```
The `--weightQuantAsymmetric` option selects the asymmetric quantization method, whose accuracy is somewhat better than the default symmetric quantization.
`--weightQuantBlock 128` quantizes weights in blocks of 128; if unset, the block size is the number of input channels. If you are willing to trade some storage size for quantization accuracy, you can add this setting; in theory the smaller the block, the higher the accuracy, but going below 32 is not recommended.
`--weightQuantBlock 128` quantizes weights in blocks of 128; if unset, the block size is the number of input channels. To trade some storage space for better quantization accuracy, add this setting. In theory the smaller the block, the higher the accuracy, but it cannot go below 32.
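To make the effect of `--weightQuantBlock` concrete, here is a toy sketch of blockwise symmetric 8-bit quantization. It is purely illustrative and not MNN's implementation: each block gets its own scale, so a smaller block keeps outliers from stretching the scale used by their neighbors, at the cost of storing more scales.

```cpp
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <vector>

// Toy blockwise symmetric 8-bit quantization (illustrative only, not MNN code).
// Each block of `block` weights gets its own scale, so smaller blocks track
// local value ranges more tightly but require storing more scales.
std::vector<int8_t> quantizeBlockwise(const std::vector<float>& w, size_t block,
                                      std::vector<float>& scales) {
    std::vector<int8_t> q(w.size());
    for (size_t start = 0; start < w.size(); start += block) {
        size_t end = std::min(start + block, w.size());
        float maxAbs = 0.0f;
        for (size_t i = start; i < end; ++i) {
            maxAbs = std::max(maxAbs, std::fabs(w[i]));
        }
        float scale = (maxAbs > 0.0f) ? maxAbs / 127.0f : 1.0f;
        scales.push_back(scale);
        for (size_t i = start; i < end; ++i) {
            q[i] = static_cast<int8_t>(std::lround(w[i] / scale));
        }
    }
    return q;
}
```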

- Dynamic quantization
Dynamic quantization support in the MNN runtime can be enabled as follows, so that core operators such as convolution in a weight-quantized model use quantized computation, reducing memory use and improving performance

1. Build MNN with the MNN_LOW_MEMORY compile macro enabled (this provides the dynamic quantization support)
```
cmake .. -DMNN_LOW_MEMORY=ON
```
2. When using the MNN model, set the memory mode to low
```
MNN::ScheduleConfig config;
BackendConfig backendConfig;
backendConfig.memory = BackendConfig::Memory_Low;
config.backendConfig = &backendConfig;
```
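Putting the two steps together, a minimal end-to-end sketch — the model path is hypothetical, and this assumes MNN was built with MNN_LOW_MEMORY=ON:

```cpp
#include <MNN/Interpreter.hpp>

int main() {
    // Load a weight-quantized model (hypothetical path).
    auto net = MNN::Interpreter::createFromFile("model_int8.mnn");
    MNN::ScheduleConfig config;
    MNN::BackendConfig backendConfig;
    // Memory_Low turns on dynamic quantization for conv-like ops.
    backendConfig.memory = MNN::BackendConfig::Memory_Low;
    config.backendConfig = &backendConfig;
    auto session = net->createSession(config);
    // ... fill input tensors here ...
    net->runSession(session);
    net->releaseSession(session);
    MNN::Interpreter::destroy(net);
    return 0;
}
```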
### FP16 compression
- Converts the model's FP32 weights to FP16 storage and enables FP16 inference on supported devices for an inference speedup, with the speed reduced to 1/2 of the original. Can be done in one step during model conversion; convenient to use
- Converts the model's FP32 weights to FP16 storage; size shrinks to 1/2 of the original
- Convert with `MNNConvert` (C++) or `mnnconvert` (bundled with the Python package), adding the following option to the conversion command line:
```bash
--fp16
```

Note: FP16 compression is independent of FP16 acceleration. As long as precision = low is set, MNN enables FP16 acceleration on supported devices for both FP32 and FP16 models

```
MNN::ScheduleConfig config;
BackendConfig backendConfig;
backendConfig.precision = BackendConfig::Precision_Low;
config.backendConfig = &backendConfig;
```


## Offline quantization tool
### Installing the offline quantization tool
- C++ tool installation
7 changes: 7 additions & 0 deletions express/Executor.cpp
@@ -233,6 +233,13 @@ void Executor::RuntimeManager::setHint(Interpreter::HintMode mode, int value) {
void Executor::RuntimeManager::setExternalPath(std::string path, int type) {
mInside->modes.setExternalPath(path, type);
}
void Executor::RuntimeManager::setHintPtr(Interpreter::HintMode mode, void* value) {
auto current = ExecutorScope::Current();
auto rt = current->getRuntime();
for (auto& iter : rt.first) {
iter.second->pMeta = value;
}
}

bool Executor::RuntimeManager::getInfo(Interpreter::SessionInfoCode code, void* ptr) {
// Only support get memory
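The new `setHintPtr` fans a pointer-valued hint out to every runtime held by the RuntimeManager by writing it into the runtime's `pMeta` field, which `CPUAttention` later reads back as a `KVMeta*` (see CPUAttention.cpp below). A hedged sketch of how a caller might use it — pairing it with the new `KVCACHE_INFO` hint and the `MNN::KVMeta` spelling are inferences from this diff, not documented API:

```cpp
#include <MNN/Interpreter.hpp>
#include <MNN/expr/Executor.hpp>

#include "core/OpCommonUtils.hpp" // assumed location of KVMeta, per CPUAttention.hpp

// Sketch: hand a KVMeta block to all runtimes so attention ops can read the
// kv-cache bookkeeping (previous/remove/add) without extra plumbing.
void attachKVMeta(std::shared_ptr<MNN::Express::Executor::RuntimeManager> rtmgr,
                  MNN::KVMeta* meta) {
    rtmgr->setHintPtr(MNN::Interpreter::KVCACHE_INFO, meta);
}
```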
6 changes: 6 additions & 0 deletions include/MNN/Interpreter.hpp
@@ -236,6 +236,12 @@ class MNN_PUBLIC Interpreter {
KVCACHE_SIZE_LIMIT = 8,
// Op encoder number for commit
OP_ENCODER_NUMBER_FOR_COMMIT = 9,

// KVCache Info
KVCACHE_INFO = 10,
// mmap allocate file size, KB
MMAP_FILE_SIZE = 11,
USE_CACHED_MMAP = 12
};

enum ExternalPathType {
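A hedged sketch of driving the new hints from user code through the existing `setHint` API (values illustrative). One caveat worth flagging: the enum comment says `MMAP_FILE_SIZE` is in KB, but CPUBackend.cpp below multiplies the value by 1024 * 1024, which suggests the unit is actually MB:

```cpp
#include <MNN/Interpreter.hpp>
#include <MNN/expr/Executor.hpp>

// Sketch: opt in to the new mmap behavior (hint values are illustrative).
void configureMmap(std::shared_ptr<MNN::Express::Executor::RuntimeManager> rtmgr) {
    // Size of the mmap-backed allocation file; see the KB-vs-MB caveat above.
    rtmgr->setHint(MNN::Interpreter::MMAP_FILE_SIZE, 1024);
    // Reuse a previously written ".static" cache file instead of recreating it.
    rtmgr->setHint(MNN::Interpreter::USE_CACHED_MMAP, 1);
}
```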
2 changes: 1 addition & 1 deletion include/MNN/MNNDefine.h
@@ -75,6 +75,6 @@ MNN_ERROR("Check failed: %s ==> %s\n", #success, #log); \
#define STR(x) STR_IMP(x)
#define MNN_VERSION_MAJOR 3
#define MNN_VERSION_MINOR 0
#define MNN_VERSION_PATCH 2
#define MNN_VERSION_PATCH 3
#define MNN_VERSION STR(MNN_VERSION_MAJOR) "." STR(MNN_VERSION_MINOR) "." STR(MNN_VERSION_PATCH)
#endif /* MNNDefine_h */
1 change: 1 addition & 0 deletions include/MNN/expr/Executor.hpp
@@ -125,6 +125,7 @@ class MNN_PUBLIC Executor {
friend class Executor;
void setMode(Interpreter::SessionMode mode);
void setHint(Interpreter::HintMode mode, int value);
void setHintPtr(Interpreter::HintMode mode, void* value);
bool getInfo(Interpreter::SessionInfoCode code, void* ptr);
BackendConfig* getBnConfig();
const RuntimeAttr* getInside() const {
1 change: 1 addition & 0 deletions project/android/build_32.sh
@@ -9,6 +9,7 @@ cmake ../../../ \
-DMNN_USE_LOGCAT=false \
-DMNN_USE_SSE=OFF \
-DMNN_BUILD_TEST=ON \
-DMNN_ARM82=OFF \
-DMNN_BUILD_FOR_ANDROID_COMMAND=true \
-DNATIVE_LIBRARY_OUTPUT=. -DNATIVE_INCLUDE_OUTPUT=. $1 $2 $3 $4 $5 $6 $7

25 changes: 23 additions & 2 deletions pymnn/src/llm.h
@@ -52,6 +52,25 @@ static PyObject* PyMNNLLM_generate(LLM *self, PyObject *args) {
return toPyObj<int, toPyObj>(output_ids);
}

static PyObject* PyMNNLLM_eraseHistory(LLM *self, PyObject *args) {
if (self->is_embedding) {
Py_RETURN_NONE;
}
size_t history = 0;
size_t end = 0;
if (!PyArg_ParseTuple(args, "LL", &history, &end)) {
Py_RETURN_NONE;
}
self->llm->eraseHistory(history, end);
Py_RETURN_NONE;
}
static PyObject* PyMNNLLM_getCurrentHistory(LLM *self, PyObject *args) {
if (self->is_embedding) {
Py_RETURN_NONE;
}
auto history = self->llm->getCurrentHistory();
return PyLong_FromLong(history);
}
static PyObject* PyMNNLLM_response(LLM *self, PyObject *args) {
if (self->is_embedding) {
Py_RETURN_NONE;
@@ -62,8 +81,8 @@ static PyObject* PyMNNLLM_response(LLM *self, PyObject *args) {
Py_RETURN_NONE;
}
std::ostringstream null_os;
auto res = self->llm->response(query, stream ? &std::cout : &null_os);
return string2Object(res);
self->llm->response(query, stream ? &std::cout : &null_os);
return string2Object(null_os.str());
}

static PyObject* PyMNNLLM_tokenizer_encode(LLM *self, PyObject *args) {
@@ -109,6 +128,8 @@ static PyMethodDef PyMNNLLM_methods[] = {
{"forward", (PyCFunction)PyMNNLLM_forward, METH_VARARGS, "forward `logits` by `input_ids`."},
{"generate", (PyCFunction)PyMNNLLM_generate, METH_VARARGS, "generate `output_ids` by `input_ids`."},
{"response", (PyCFunction)PyMNNLLM_response, METH_VARARGS, "response `query` without hsitory."},
{"get_current_history", (PyCFunction)PyMNNLLM_getCurrentHistory, METH_VARARGS, "Get Current History."},
{"erase_history", (PyCFunction)PyMNNLLM_eraseHistory, METH_VARARGS, "Erase History."},
{"tokenizer_encode", (PyCFunction)PyMNNLLM_tokenizer_encode, METH_VARARGS, "tokenizer encode."},
{"tokenizer_decode", (PyCFunction)PyMNNLLM_tokenizer_decode, METH_VARARGS, "tokenizer decode."},
{"txt_embedding", (PyCFunction)PyMNNLLM_txt_embedding, METH_VARARGS, "txt embedding."},
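The two new bindings forward to `Llm::getCurrentHistory()` and `Llm::eraseHistory(history, end)` on the underlying C++ object. A sketch of the equivalent C++ calls, with the header path, factory name, and exact signatures taken as assumptions from MNN's llm engine rather than from this diff:

```cpp
#include <iostream>
#include <memory>

#include "llm/llm.hpp" // assumed header of MNN's llm engine

using MNN::Transformer::Llm;

int main() {
    // Config path is hypothetical.
    std::unique_ptr<Llm> llm(Llm::createLLM("llm_config.json"));
    llm->load();
    llm->response("Hello", &std::cout);
    auto history = llm->getCurrentHistory(); // kv history length so far
    llm->eraseHistory(0, history);           // forget everything up to `history`
    return 0;
}
```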
21 changes: 8 additions & 13 deletions source/backend/cpu/CPUAttention.cpp
@@ -177,7 +177,7 @@ ErrorCode CPUAttention::onResize(const std::vector<Tensor*>& inputs, const std::
backend()->onAcquireBuffer(mPackQ.get(), Backend::DYNAMIC);
backend()->onAcquireBuffer(mPackQKV.get(), Backend::DYNAMIC);
backend()->onReleaseBuffer(mPackQ.get(), Backend::DYNAMIC);
backend()->onReleaseBuffer(mPackQKV.get(), Backend::DYNAMIC);
backend()->onReleaseBuffer(mPackQKV.get(), Backend::DYNAMIC);
}
return NO_ERROR;
}
@@ -193,9 +193,6 @@ ErrorCode CPUAttention::onExecute(const std::vector<Tensor*>& inputs, const std:
int mask_kvlen = mask->length(3);
int seq_len = query->length(1);
MNN_ASSERT(seq_len == mask_seqlen);
mIsPrefill = (seq_len > 1);
// isPrefill and mask is Square Matrix, is FirstPrefill
mIsFirstPrefill = mIsPrefill && (mask_kvlen == mask_seqlen);
int tileCount = UP_DIV(mNumHead, mThreadNum);
int group_size = mNumHead / mKvNumHead;
// reduce the value of 'query' to avoid fp16 overflow
@@ -215,15 +212,12 @@ ErrorCode CPUAttention::onExecute(const std::vector<Tensor*>& inputs, const std:
mScale /= q_scale;
}

if (mIsPrefill) {
if (mIsFirstPrefill) {
mKVCacheManager->onClear();
mKVCacheManager->onAlloc(seq_len);
} else {
mKVCacheManager->onRealloc(mKVCacheManager->kvLength() + seq_len);
}
} else { // Decode
mKVCacheManager->onRealloc(mKVCacheManager->kvLength() + 1);
if (mMeta->previous == mMeta->remove) {
mKVCacheManager->onClear();
mKVCacheManager->onAlloc(mMeta->add);
} else {
MNN_ASSERT(mMeta->previous == mKVCacheManager->kvLength());
mKVCacheManager->onRealloc(mMeta);
}
// Add the new kv to the kvcache
mKVCacheManager->onPushBack(key, value);
@@ -383,6 +377,7 @@ bool CPUAttention::onClone(Backend* bn, const Op* op, Execution** dst) {
}

CPUAttention::CPUAttention(Backend *backend, bool kv_cache) : Execution(backend), mKVCache(kv_cache) {
mMeta = (KVMeta*)(backend->getRuntime()->pMeta);
if (mKVCache) {
mPackQ.reset(Tensor::createDevice<float>({1, 1, 1, 1}));
mPackQKV.reset(Tensor::createDevice<float>({1, 1, 1, 1}));
4 changes: 2 additions & 2 deletions source/backend/cpu/CPUAttention.hpp
@@ -13,6 +13,7 @@

#include <functional>
#include "core/Execution.hpp"
#include "core/OpCommonUtils.hpp"
#include "MNN/ErrorCode.hpp"
#include "KVCacheManager.hpp"

@@ -26,8 +27,6 @@ class CPUAttention : public Execution {
virtual ErrorCode onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
virtual bool onClone(Backend* bn, const Op* op, Execution** dst) override;
private:
bool mIsPrefill = true;
bool mIsFirstPrefill = true;
bool mKVCache = true;
bool mUseGemmInt8 = false;
int bytes = 4;
Expand All @@ -40,6 +39,7 @@ class CPUAttention : public Execution {
std::vector<float> mMinQ, mMaxQ, mQueryScale, mQueryZeroPoint;
template <typename T> void pack_query(Tensor* query, char* pack_q, char* sum_q, int seq_len, int h, float q_scale);
template <typename T> void unpack_QK(float * unpack_qk_dst, char * pack_qk_src, int seq_len, int kv_seq_len);
KVMeta* mMeta;
};

} // namespace MNN
23 changes: 19 additions & 4 deletions source/backend/cpu/CPUBackend.cpp
@@ -19,6 +19,7 @@
#include "CPUCast.hpp"
#include "core/OpCommonUtils.hpp"
#include "core/WrapExecution.hpp"
#include "core/MNNFileUtils.h"
#ifdef _OPENMP
#include <omp.h>
#endif // _OPENMP
@@ -194,6 +195,7 @@ CPURuntime::CPURuntime(const Backend::Info& info) {
MNN_PRINT("create CPURuntime:%p\n", this);
#endif
}

CPURuntime:: ~ CPURuntime() {
#ifdef MNN_USE_THREAD_POOL
ThreadPool::releaseWorkIndex(mTaskIndex);
@@ -223,18 +225,31 @@ Backend* CPURuntime::onCreate(const BackendConfig* config, Backend* origin) cons
if (mDynamicMmap.empty()) {
// Only support set featuremap dir once
mDynamicMmap.resize(2);
auto mmapMem = BufferAllocator::Allocator::createMmap(hint().midMemoryPath.c_str(), "dynamic");
auto mmapMem = BufferAllocator::Allocator::createMmap(hint().midMemoryPath.c_str(), "", "dynamic");
for (auto& buf : mDynamicMmap) {
buf.root = mmapMem;
}
}
}
if (hint().weightMemoryPath.size() > 0) {
// forward_type, precision_type, memory_type, power_type
std::string prefix = "0_0_0_0_";
prefix[2] += mPrecision;
prefix[4] += mMemory;
prefix[6] += mPower;
// prefix += hint().modelUUID + "_";
bool autoRemove = true;
if (hint().useCachedMmap) {
autoRemove = false;
std::string fileName = MNNFilePathConcat(hint().weightMemoryPath, prefix + "0.static");
const_cast<RuntimeHint&>(hint()).useCachedMmap += MNNFileExist(fileName.c_str());
}
if (nullptr == mStaticAllocatorCache.get()) {
// Only support set weightmap dir once
mStaticAllocatorCache = mStaticAllocator;
auto mmapMem = BufferAllocator::Allocator::createMmap(hint().weightMemoryPath.c_str(), "static");
mStaticAllocator.reset(new EagerBufferAllocator(mmapMem, 32, 1024 * 1024 * 1024));
auto mmapMem = BufferAllocator::Allocator::createMmap(hint().weightMemoryPath.c_str(), prefix.c_str(), "static", autoRemove);
int mmapSize = hint().mmapFileSize * 1024 * 1024;
mStaticAllocator.reset(new EagerBufferAllocator(mmapMem, 32, mmapSize));
}
}
auto precision = mPrecision;
@@ -551,7 +566,7 @@ static OpType _getRealOpType(OpType opType) {
return OpType_DepthwiseConvInt8;
case OpType_Pooling:
return OpType_PoolInt8;

// case OpType_Eltwise:
// // TODO: just support EltwiseAdd
// return OpType_EltwiseInt8;
23 changes: 21 additions & 2 deletions source/backend/cpu/CPUStft.cpp
@@ -7,7 +7,11 @@
//

#ifdef MNN_BUILD_AUDIO

#ifndef M_PI
#define M_PI 3.141592654
#endif
#include <algorithm>
#include <cmath>
#include "backend/cpu/CPUStft.hpp"
#include "backend/cpu/CPUBackend.hpp"
#include "core/Concurrency.h"
Expand All @@ -16,6 +20,21 @@
#include "compute/CommonOptFunction.h"

namespace MNN {
static void MNNDftAbs(const float* input, const float* window, float* output, float* buffer, int nfft) {
for (int i = 0; i < nfft; ++i) {
buffer[i] = input[i] * window[i];
}
for (int k = 0; k < nfft / 2 + 1; ++k) {
float real_sum = 0.f, imag_sum = 0.f;
for (int n = 0; n < nfft; ++n) {
float angle = 2 * M_PI * k * n / nfft;
real_sum += buffer[n] * cosf(angle);
imag_sum -= buffer[n] * sinf(angle);
}
output[k] = sqrtf(real_sum * real_sum + imag_sum * imag_sum);
}
}


CPUStft::CPUStft(Backend* backend, int nfft, int hop_length, bool abs)
: Execution(backend), mNfft(nfft), mHopLength(hop_length), mAbs(abs) {
@@ -72,4 +91,4 @@ class CPUStftCreator : public CPUBackend::Creator {

REGISTER_CPU_OP_CREATOR_AUDIO(CPUStftCreator, OpType_Stft);
} // namespace MNN
#endif // MNN_BUILD_AUDIO
#endif // MNN_BUILD_AUDIO
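For reference, the `MNNDftAbs` helper added above computes the magnitude spectrum of one windowed frame by direct O(nfft²) summation. A standalone sketch of exercising it, with a Hann window chosen purely for illustration (in the op itself the window arrives as a tensor input):

```cpp
#include <cmath>
#include <vector>

// Assumes MNNDftAbs from CPUStft.cpp is visible here (it is file-static in the
// commit; treat this as an isolated test harness, not real MNN usage).
int main() {
    const int nfft = 8;
    std::vector<float> input(nfft, 1.0f);   // dummy constant signal
    std::vector<float> window(nfft);
    std::vector<float> buffer(nfft);        // scratch for the windowed frame
    std::vector<float> output(nfft / 2 + 1);
    for (int i = 0; i < nfft; ++i) {        // Hann window, illustrative
        window[i] = 0.5f - 0.5f * cosf(2.0f * 3.14159265f * i / (nfft - 1));
    }
    MNNDftAbs(input.data(), window.data(), output.data(), buffer.data(), nfft);
    // output[k] now holds |X_k| of the windowed frame for k = 0 .. nfft/2.
    return 0;
}
```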
