From 88834357ab817ee80e4f8843046cac04aedf3aeb Mon Sep 17 00:00:00 2001 From: gushiqiao Date: Fri, 18 Oct 2024 18:58:56 +0800 Subject: [PATCH 1/5] Add spinquant --- README.md | 238 ++++-- README_ja.md | 266 +++--- README_zh.md | 291 ++++--- benchmark/align.md | 42 + benchmark/calib.md | 53 ++ ci_check/awq_w4a16_fakequant_eval.yml | 1 - ci_check/change_files.py | 40 +- ci_check/cpu.txt | 2 - ci_check/run.sh | 17 +- .../AdaDim/adadim_w8a8_fakequant_eval.yml | 37 + configs/quantization/Awq/awq_w4a16.yml | 33 + .../Awq/awq_w4a16_fakequant_eval.yml | 32 + .../Awq/awq_w4a16_fakequant_eval_general.yml | 37 + ...wq_w4a16_fakequant_eval_general_custom.yml | 36 + .../Awq/awq_w4a16_fakequant_trt-llm.yml | 36 + configs/quantization/Awq/awq_w4a4_best.yml | 52 ++ configs/quantization/Awq/awq_w4a8_best.yml | 52 ++ .../Awq/awq_w8a8_fakequant_eval_general.yml | 35 + .../Awq/awq_w_only_mix_bits_1.yml | 46 ++ .../Awq/awq_w_only_mix_bits_2.yml | 49 ++ configs/quantization/Awq/awq_wa_mix_bits.yml | 47 ++ .../DGQ/dgq_w4a8_fakequant_eval.yml | 41 + .../FP/awq_we2m1a16_128_fakequant_eval.yml | 33 + .../FP/rtn_w4a16_fakequant_eval.yml | 24 + .../FP/rtn_w8a8_fakequant_eval.yml | 27 + .../FP/rtn_we2m1a16_fakequant_eval.yml | 23 + .../FP/rtn_we2m1a16_fakequant_g128_eval.yml | 24 + .../FP/rtn_we2m1ae2m1_fakequant_eval.yml | 27 + .../FP/rtn_we4m3ae4m3_fakequant_eval.yml | 27 + .../FP/rtn_we5m2ae5m2_fakequant_eval.yml | 27 + .../GPTQ/gptq_owq_w4a16_fakequant_eval.yml | 41 + configs/quantization/GPTQ/gptq_quarot.yml | 51 ++ .../GPTQ/gptq_w4a16_fakequant_eval.yml | 39 + .../gptq_w4a16_fakequant_eval_general.yml | 39 + .../HQQ/hqq_w4a16_fakequant_eval.yml | 30 + .../LlmInt8/llmint8_w8a8_fakequant_eval.yml | 38 + .../ntweak_llama_w4a16_fakequant_eval.yml | 38 + .../ntweak_llama_w8a8_fakequant_eval.yml | 42 + .../OmniQuant/omniq_llama_w2a16_best.yml | 51 ++ .../omniq_llama_w2a16_fakequant_eval.yml | 49 ++ .../omniq_llama_w4a16_fakequant_eval.yml | 49 ++ .../OmniQuant/omniq_llama_w4a4_best.yml | 59 ++ .../OmniQuant/omniq_llama_w4a8_best.yml | 59 ++ .../omniq_llama_w8a8_fakequant_eval.yml | 51 ++ .../omniq_mistral_w8a8_fakequant_eval.yml | 49 ++ .../omniq_opt_w8a8_fakequant_eval.yml | 49 ++ .../OsPlus/osplus_llama_w4a4_best.yml | 46 ++ .../OsPlus/osplus_llama_w4a8_best.yml | 46 ++ ...plus_llama_w8a8_fakequant_eval_general.yml | 36 + ...osplus_opt_w8a8_fakequant_eval_general.yml | 36 + .../QUIK/quik_w4a4_fakequant_eval.yml | 41 + configs/quantization/QuaRot/quarot_w4a4.yml | 36 + configs/quantization/RTN/rtn_w4a16.yml | 16 + .../RTN/rtn_w4a16_fakequant_eval.yml | 23 + configs/quantization/RTN/rtn_w8a8.yml | 20 + .../RTN/rtn_w8a8_fakequant_eval.yml | 26 + .../RTN/rtn_w8a8_pertensor_static.yml | 36 + .../smoothquant_llama_w8a8_fakequant_eval.yml | 35 + ...uant_llama_w8a8_fakequant_eval_general.yml | 35 + .../smoothquant_llama_w8a8_trt-llm.yml | 35 + .../smoothquant_opt_w8a8_fakequant_eval.yml | 35 + ...hquant_opt_w8a8_fakequant_eval_general.yml | 35 + .../SpQR/spqr_w4a16_fakequant_eval.yml | 54 ++ .../quantization/SpinQuant/spinquant_w4a4.yml | 63 ++ .../sparsification/Magnitude/magnitude.yml | 30 + configs/sparsification/ShortGPT/shortgpt.yml | 30 + configs/sparsification/Wanda/wanda.yml | 31 + docs/en/source/advanced/model_test.md | 181 +++++ docs/en/source/configs.md | 102 +-- docs/en/source/index.rst | 20 +- docs/en/source/quickstart.md | 86 +- docs/zh_cn/source/advanced/model_test.md | 180 +++++ docs/zh_cn/source/configs.md | 107 +-- docs/zh_cn/source/index.rst | 19 +- docs/zh_cn/source/quickstart.md | 69 +- 
.../backend/autoawq/infer_with_autoawq.py | 34 - examples/backend/mlcllm/infer_with_mlcllm.py | 17 - examples/backend/sglang/infer_with_sglang.py | 13 - examples/backend/vllm/infer_with_vllm.py | 21 - llmc/__main__.py | 240 ++---- llmc/compression/blockwise_optimization.py | 6 +- llmc/compression/quantization/__init__.py | 4 +- llmc/compression/quantization/awq.py | 24 +- .../base_blockwise_quantization.py | 287 ++++--- llmc/compression/quantization/dgq.py | 31 +- llmc/compression/quantization/gptq.py | 100 ++- .../quantization/hadamard_utils.py | 102 ++- llmc/compression/quantization/hqq.py | 22 +- llmc/compression/quantization/llmint8.py | 4 +- llmc/compression/quantization/module_utils.py | 762 +++++++----------- llmc/compression/quantization/ntweak.py | 6 +- llmc/compression/quantization/omniq.py | 15 +- llmc/compression/quantization/osplus.py | 6 +- llmc/compression/quantization/quant.py | 520 ++++-------- llmc/compression/quantization/quarot.py | 55 +- llmc/compression/quantization/quik.py | 4 +- llmc/compression/quantization/rotate_utils.py | 102 +++ llmc/compression/quantization/rtn.py | 25 +- llmc/compression/quantization/smoothquant.py | 8 +- llmc/compression/quantization/spinquant.py | 231 ++++++ llmc/compression/quantization/spqr.py | 40 +- llmc/compression/quantization/train_utils.py | 187 +++++ llmc/compression/quantization/utils.py | 19 - .../base_blockwise_sparsification.py | 4 +- llmc/compression/sparsification/magnitude.py | 4 +- llmc/compression/sparsification/shortgpt.py | 4 +- llmc/compression/sparsification/wanda.py | 6 +- llmc/data/__init__.py | 2 +- llmc/data/dataset/__init__.py | 1 + llmc/data/dataset/base_dataset.py | 287 +------ llmc/data/dataset/specified_preproc.py | 88 +- llmc/data/dataset/train_dataset.py | 62 ++ llmc/data/tokenizer/base_tokenizer.py | 11 +- llmc/eval/__init__.py | 2 - llmc/eval/eval_ppl.py | 93 ++- llmc/eval/eval_token.py | 185 +++++ llmc/models/__init__.py | 11 - llmc/models/base_model.py | 78 +- llmc/models/bloom.py | 7 +- llmc/models/falcon.py | 10 +- llmc/models/gemma2.py | 37 +- llmc/models/internlm2.py | 18 +- llmc/models/llama.py | 15 +- llmc/models/llava.py | 22 +- llmc/models/mistral.py | 7 +- llmc/models/mixtral.py | 34 +- llmc/models/opt.py | 7 +- llmc/models/qwen2.py | 21 +- llmc/models/starcoder.py | 7 +- llmc/utils/__init__.py | 5 +- llmc/utils/utils.py | 14 - lm-evaluation-harness | 1 - requirements/runtime.txt | 25 +- scripts/run_adadim_llama.sh | 15 + scripts/run_awq_llama.sh | 16 + scripts/run_dgq_llama.sh | 16 + scripts/run_gptq_llama.sh | 15 + scripts/run_gptq_owq_llama.sh | 15 + scripts/run_hqq_llama.sh | 15 + scripts/run_in_tmux_sequence.sh | 25 + scripts/run_llmint8_llama.sh | 16 + scripts/run_ntweak_llama.sh | 16 + scripts/run_omniq_llama.sh | 16 + scripts/run_omniq_mistral.sh | 15 + scripts/run_omniq_opt.sh | 15 + scripts/run_osplus_llama.sh | 15 + scripts/run_osplus_opt.sh | 15 + scripts/run_quarot_llama.sh | 15 + scripts/run_quik_llama.sh | 15 + scripts/run_rtn_llama.sh | 15 + scripts/run_rtn_llama_static.sh | 15 + scripts/run_shortgpt_llama.sh | 15 + scripts/run_smoothquant_llama.sh | 15 + scripts/run_smoothquant_opt.sh | 15 + scripts/run_spinquant_llama.sh | 15 + scripts/run_spqr_llama.sh | 15 + scripts/run_wanda_llama.sh | 15 + tools/outlier_analysis.py | 483 +++++++++++ tools/token_analysis.py | 185 +++++ 159 files changed, 6223 insertions(+), 2711 deletions(-) create mode 100644 benchmark/align.md create mode 100644 benchmark/calib.md create mode 100644 
configs/quantization/AdaDim/adadim_w8a8_fakequant_eval.yml create mode 100644 configs/quantization/Awq/awq_w4a16.yml create mode 100644 configs/quantization/Awq/awq_w4a16_fakequant_eval.yml create mode 100644 configs/quantization/Awq/awq_w4a16_fakequant_eval_general.yml create mode 100644 configs/quantization/Awq/awq_w4a16_fakequant_eval_general_custom.yml create mode 100644 configs/quantization/Awq/awq_w4a16_fakequant_trt-llm.yml create mode 100644 configs/quantization/Awq/awq_w4a4_best.yml create mode 100644 configs/quantization/Awq/awq_w4a8_best.yml create mode 100644 configs/quantization/Awq/awq_w8a8_fakequant_eval_general.yml create mode 100644 configs/quantization/Awq/awq_w_only_mix_bits_1.yml create mode 100644 configs/quantization/Awq/awq_w_only_mix_bits_2.yml create mode 100644 configs/quantization/Awq/awq_wa_mix_bits.yml create mode 100644 configs/quantization/DGQ/dgq_w4a8_fakequant_eval.yml create mode 100644 configs/quantization/FP/awq_we2m1a16_128_fakequant_eval.yml create mode 100644 configs/quantization/FP/rtn_w4a16_fakequant_eval.yml create mode 100644 configs/quantization/FP/rtn_w8a8_fakequant_eval.yml create mode 100644 configs/quantization/FP/rtn_we2m1a16_fakequant_eval.yml create mode 100644 configs/quantization/FP/rtn_we2m1a16_fakequant_g128_eval.yml create mode 100644 configs/quantization/FP/rtn_we2m1ae2m1_fakequant_eval.yml create mode 100644 configs/quantization/FP/rtn_we4m3ae4m3_fakequant_eval.yml create mode 100644 configs/quantization/FP/rtn_we5m2ae5m2_fakequant_eval.yml create mode 100644 configs/quantization/GPTQ/gptq_owq_w4a16_fakequant_eval.yml create mode 100644 configs/quantization/GPTQ/gptq_quarot.yml create mode 100644 configs/quantization/GPTQ/gptq_w4a16_fakequant_eval.yml create mode 100644 configs/quantization/GPTQ/gptq_w4a16_fakequant_eval_general.yml create mode 100644 configs/quantization/HQQ/hqq_w4a16_fakequant_eval.yml create mode 100644 configs/quantization/LlmInt8/llmint8_w8a8_fakequant_eval.yml create mode 100644 configs/quantization/NormTweaking/ntweak_llama_w4a16_fakequant_eval.yml create mode 100644 configs/quantization/NormTweaking/ntweak_llama_w8a8_fakequant_eval.yml create mode 100644 configs/quantization/OmniQuant/omniq_llama_w2a16_best.yml create mode 100644 configs/quantization/OmniQuant/omniq_llama_w2a16_fakequant_eval.yml create mode 100644 configs/quantization/OmniQuant/omniq_llama_w4a16_fakequant_eval.yml create mode 100644 configs/quantization/OmniQuant/omniq_llama_w4a4_best.yml create mode 100644 configs/quantization/OmniQuant/omniq_llama_w4a8_best.yml create mode 100644 configs/quantization/OmniQuant/omniq_llama_w8a8_fakequant_eval.yml create mode 100644 configs/quantization/OmniQuant/omniq_mistral_w8a8_fakequant_eval.yml create mode 100644 configs/quantization/OmniQuant/omniq_opt_w8a8_fakequant_eval.yml create mode 100644 configs/quantization/OsPlus/osplus_llama_w4a4_best.yml create mode 100644 configs/quantization/OsPlus/osplus_llama_w4a8_best.yml create mode 100644 configs/quantization/OsPlus/osplus_llama_w8a8_fakequant_eval_general.yml create mode 100644 configs/quantization/OsPlus/osplus_opt_w8a8_fakequant_eval_general.yml create mode 100644 configs/quantization/QUIK/quik_w4a4_fakequant_eval.yml create mode 100644 configs/quantization/QuaRot/quarot_w4a4.yml create mode 100644 configs/quantization/RTN/rtn_w4a16.yml create mode 100644 configs/quantization/RTN/rtn_w4a16_fakequant_eval.yml create mode 100644 configs/quantization/RTN/rtn_w8a8.yml create mode 100644 configs/quantization/RTN/rtn_w8a8_fakequant_eval.yml create mode 
100644 configs/quantization/RTN/rtn_w8a8_pertensor_static.yml create mode 100644 configs/quantization/SmoothQuant/smoothquant_llama_w8a8_fakequant_eval.yml create mode 100644 configs/quantization/SmoothQuant/smoothquant_llama_w8a8_fakequant_eval_general.yml create mode 100644 configs/quantization/SmoothQuant/smoothquant_llama_w8a8_trt-llm.yml create mode 100644 configs/quantization/SmoothQuant/smoothquant_opt_w8a8_fakequant_eval.yml create mode 100644 configs/quantization/SmoothQuant/smoothquant_opt_w8a8_fakequant_eval_general.yml create mode 100644 configs/quantization/SpQR/spqr_w4a16_fakequant_eval.yml create mode 100644 configs/quantization/SpinQuant/spinquant_w4a4.yml create mode 100644 configs/sparsification/Magnitude/magnitude.yml create mode 100644 configs/sparsification/ShortGPT/shortgpt.yml create mode 100644 configs/sparsification/Wanda/wanda.yml create mode 100644 docs/en/source/advanced/model_test.md create mode 100644 docs/zh_cn/source/advanced/model_test.md delete mode 100644 examples/backend/autoawq/infer_with_autoawq.py delete mode 100644 examples/backend/mlcllm/infer_with_mlcllm.py delete mode 100644 examples/backend/sglang/infer_with_sglang.py delete mode 100644 examples/backend/vllm/infer_with_vllm.py create mode 100644 llmc/compression/quantization/rotate_utils.py create mode 100644 llmc/compression/quantization/spinquant.py create mode 100644 llmc/data/dataset/train_dataset.py create mode 100644 llmc/eval/eval_token.py delete mode 160000 lm-evaluation-harness create mode 100644 scripts/run_adadim_llama.sh create mode 100644 scripts/run_awq_llama.sh create mode 100644 scripts/run_dgq_llama.sh create mode 100644 scripts/run_gptq_llama.sh create mode 100644 scripts/run_gptq_owq_llama.sh create mode 100644 scripts/run_hqq_llama.sh create mode 100644 scripts/run_in_tmux_sequence.sh create mode 100644 scripts/run_llmint8_llama.sh create mode 100644 scripts/run_ntweak_llama.sh create mode 100644 scripts/run_omniq_llama.sh create mode 100644 scripts/run_omniq_mistral.sh create mode 100644 scripts/run_omniq_opt.sh create mode 100644 scripts/run_osplus_llama.sh create mode 100644 scripts/run_osplus_opt.sh create mode 100644 scripts/run_quarot_llama.sh create mode 100644 scripts/run_quik_llama.sh create mode 100644 scripts/run_rtn_llama.sh create mode 100644 scripts/run_rtn_llama_static.sh create mode 100644 scripts/run_shortgpt_llama.sh create mode 100644 scripts/run_smoothquant_llama.sh create mode 100644 scripts/run_smoothquant_opt.sh create mode 100644 scripts/run_spinquant_llama.sh create mode 100644 scripts/run_spqr_llama.sh create mode 100644 scripts/run_wanda_llama.sh create mode 100644 tools/outlier_analysis.py create mode 100644 tools/token_analysis.py diff --git a/README.md b/README.md index fc51e66ab..81be260a6 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# LLMC: Towards Accurate and Efficient LLM Compression +# llmc: Towards Accurate and Efficient LLM Compression llmc @@ -13,53 +13,20 @@ **\[ English | [中文](README_zh.md) | [日本語](README_ja.md) \]** -**LLMC** is an off-the-shell tool designed for compressing LLM, leveraging state-of-the-art compression algorithms to enhance efficiency and reduce model size without compromising performance. +**llmc** is an off-the-shell tool designed for compressing LLM, leveraging state-of-the-art compression algorithms to enhance efficiency and reduce model size without compromising performance. **English doc** is [here](https://llmc-en.readthedocs.io/en/latest/). 
**Chinese doc** is [here](https://llmc-zhcn.readthedocs.io/en/latest/). -**docker hub** is [here](https://hub.docker.com/r/llmcompression/llmc). - -**aliyun docker**: `registry.cn-hangzhou.aliyuncs.com/yongyang/llmcompression:[tag]` - -You can download the Docker image that can run llmc with the following command. Users in mainland China are recommended to use Alibaba Cloud Docker. - -docker hub - -``` -docker pull llmcompression/llmc:pure-latest -``` - -aliyun docker - -``` -docker pull registry.cn-hangzhou.aliyuncs.com/yongyang/llmcompression:pure-latest -``` - **Community**: - [Discord Server](https://discord.gg/qZKUDfhm) - [Tencent QQ Group](http://qm.qq.com/cgi-bin/qm/qr?_wv=1027&k=I9IGPWWj8uuRXWH3_ELWjouf6gkIMgUl&authKey=GA3WbFAsm90ePJf%2FCbc7ZyXXq4ShQktlBaLxgqS5yuSPAsr3%2BDKMRdosUiLYoilO&noverify=0&group_code=526192592) -## Latest News +## News -- **Sep 26, 2024:** 🔥 We now support exporting 💥`FP8 quantized(E4M3, E5M2)` models from 🚀`LLMC` to advanced inference backends such as [VLLM](https://github.com/vllm-project/vllm) and [SGLang](https://github.com/sgl-project/sglang). For detailed usage, please refer to the [VLLM documentation](https://llmc-en.readthedocs.io/en/latest/backend/vllm.html) and [SGLang documentation](https://llmc-en.readthedocs.io/en/latest/backend/sglang.html). - -- **Sep 24, 2024:** 🔥 We have officially released ✅INT4 and ✅INT8 models of ✨`Llama-3.1-405B`, quantized using 🚀`LLMC` in `save_lightllm` mode. You can download the model parameters [here](https://huggingface.co/Dongz/llama31-405b-quant). - -- **Sep 23, 2024:** 🔥 We now support exporting ✨`real quantized(INT4, INT8)` models from 🚀`LLMC` to advanced inference backends such as [VLLM](https://github.com/vllm-project/vllm), [SGLang](https://github.com/sgl-project/sglang), [AutoAWQ](https://github.com/casper-hansen/AutoAWQ), and [MLC-LLM](https://github.com/mlc-ai/mlc-llm) for quantized inference deployment, enabling ✨`reduced memory usage` and ✨`faster inference speeds`. - For detailed usage, please refer to the [VLLM documentation](https://llmc-en.readthedocs.io/en/latest/backend/vllm.html), [SGLang documentation](https://llmc-en.readthedocs.io/en/latest/backend/sglang.html), [AutoAWQ documentation](https://llmc-en.readthedocs.io/en/latest/backend/autoawq.html), and [MLC-LLM documentation](https://llmc-en.readthedocs.io/en/latest/backend/mlcllm.html). - -- **Sep 9, 2024:** 🔥 We provide some configs of our best practice towards superior performance (see Best Practice [here](https://llmc-en.readthedocs.io/en/latest/)). - -* **Sep 3, 2024:** 🔥 We support [opencompass](https://github.com/open-compass/opencompass) 🤗 to eval 🚀`LLMC` model. Follow this [doc](https://llmc-en.readthedocs.io/en/latest/advanced/model_test_v2.html) and have a try! - -* **Aug 22, 2024:** 🔥We support lots of small language models, including current SOTA [SmolLM](https://huggingface.co/collections/HuggingFaceTB/smollm-6695016cad7167254ce15966)(see [Supported Model List](#supported-model-list)). - -* **Aug 22, 2024:** 🔥 Additionally, we also support down stream task evaluation through our modified [lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-harness) 🤗. Specifically, people can first employ `save_trans` mode(see `save` part in [Configuration](https://llmc-en.readthedocs.io/en/latest/configs.html)) to save a weight modified model. After obtaining the transformed model, they can directly evaluate the quantized model referring to [run_lm_eval.sh](scripts/run_lm_eval.sh). 
More details can be found in [here](https://llmc-en.readthedocs.io/en/latest/advanced/model_test_v1.html). - -* **Jul 23, 2024:** 🍺🍺🍺 We release a brand new version benchmark paper: +- **Jul 23, 2024:** 🍺🍺🍺 We release a brand new version benchmark paper: [**LLMC: Benchmarking Large Language Model Quantization with a Versatile Compression Toolkit**](https://arxiv.org/abs/2405.06001v2). @@ -67,13 +34,21 @@ docker pull registry.cn-hangzhou.aliyuncs.com/yongyang/llmcompression:pure-lates (\* denotes equal contribution, 📧 denotes corresponding author.) -
-Previous News +
+ comp +
+ + Instead of focusing on the best practice, We modularly and fairly benchmark LLM quantization considering calibration data, algorithms, and data formats. With detailed observation and analysis, we provide various types of novel points for performance and method improvements under different configurations. With the powerful toolkit LLMC and comprehensive insights, future LLM researchers can efficiently integrate suitable algorithms and low-bit formats for their applications, thereby democratizing the compression of large language models. - **Jul 16, 2024:** 🔥We support Wanda/Naive(Magnitude) for llm sparsification and layer-wise mix bits quantization now! - **Jul 14, 2024:** 🔥We support rotation based quantization QuaRot now! +- **Jul 4, 2024:** 📱 We open our discussion channel. If you have any questions, please join our community: + + - [Discord Server](https://discord.gg/qZKUDfhm) + - [Tencent QQ Group](http://qm.qq.com/cgi-bin/qm/qr?_wv=1027&k=I9IGPWWj8uuRXWH3_ELWjouf6gkIMgUl&authKey=GA3WbFAsm90ePJf%2FCbc7ZyXXq4ShQktlBaLxgqS5yuSPAsr3%2BDKMRdosUiLYoilO&noverify=0&group_code=526192592) + - **May 17, 2024:** 🚀 We support some advanced large models, e.g., LLaVA, Mixtral, LLaMA V3 and Qwen V2 now. Have a try! - **May 13, 2024:** 🍺🍺🍺 We release our quantization benchmark paper: @@ -94,23 +69,157 @@ docker pull registry.cn-hangzhou.aliyuncs.com/yongyang/llmcompression:pure-lates - **Mar 7, 2024:** 🚀 We release the quantization part of a powerful and efficient LLM compression tool. Notably, our benchmark paper is coming soon😊. -
-
 
 ## Highlight Feature
 
-- 💥**Comprehensive Algorithm Support**: Provides a broad range of ✨`SOTA compression algorithms`, including ✅quantization, ✅mixed-precision quantization, and ✅sparsity, while maintaining accuracy consistent with the original repositories. ✨`Quantization best practices` (see 🚀`Best Practices` [here](https://llmc-en.readthedocs.io/en/latest/)) are also available to ensure optimal performance and efficiency.
-
-- 💥**Supported Formats**: Supports both ✨`quantization` (integer and floating-point) and ✨`sparsity`, specifically including ✅weight-activation, ✅weight-only, ✅mixed-precision quantization, as well as ✅structured and ✅unstructured sparsity.
-
-- 💥**Wide Model Support**: Offers support for a diverse array of ✨`LLM models`, including ✅LLama, ✅Mistral, ✅InternLM2, ✅Qwen2, among others, as well as ✅MOE and ✅VLM models (see [Supported Model List](#supported-model-list)).
-
-- 💥**Multi-backend Compatibility**: Seamlessly integrates with various backends for enhanced deployment flexibility. Multiple quantization settings and model formats are compatible with a wide range of backends and hardware platforms, such as ✅VLLM, ✅Sglang, ✅LightLLM, ✅MLC-LLM, and ✅AutoAWQ, making it highly versatile(see Section `Backend` [here](https://llmc-en.readthedocs.io/en/latest/)).
-
-- 💥**Performance Efficiency**: Enables quantization of large LLMs, such as ✨`Llama3.1-405B` and ✨`OPT-175B`, with PPL evaluation on a `single A100/H100/H800 GPU`.
+- Quantize LLMs, e.g., Llama2-70B and OPT-175B, and evaluate their PPL on only one A100/H100/H800 GPU💥.
+- SOTA compression algorithms [align with the original repos](benchmark/align.md) for users to choose from, and users can sequentially apply multiple algorithms to one LLM💥.
+- A transformed model (`save_trans` mode in the `quant` part of [Configuration](#configuration)), exported by our tool after a specific compression algorithm, can go through naive quantization by multiple backends, e.g., [Lightllm](https://github.com/ModelTC/lightllm) and [TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM), to get a model optimized by that algorithm, which the corresponding backend can then infer💥.
+- Our compressed model (`save_lightllm` mode in the `quant` part of [Configuration](#configuration)) has a small memory footprint and can be directly inferred by [Lightllm](https://github.com/ModelTC/lightllm)💥.
 
 ## Usage
 
-Please refer to the 🚀`Quick Start` section in the [documentation](https://llmc-en.readthedocs.io/en/latest/).
+1. Clone this repository and install packages:
+
+   ```shell
+   # install packages
+   cd llmc
+   pip install -r requirements.txt
+   ```
+
+2. Prepare models and data.
+
+   ```shell
+   # After downloading LLMs from Hugging Face, prepare calibration and evaluation data as follows:
+   cd tools
+   python download_calib_dataset.py --save_path [calib data path]
+   python download_eval_dataset.py --save_path [eval data path]
+   ```
+
+3. Choose an algorithm to quantize your model:
+
+   ```shell
+   # Here's an example using Awq:
+   cd scripts
+   # Modify the path of llmc, ``llmc_path``, in the bash file. You can also choose one config
+   # placed in ``llmc/configs/quantization/Awq/`` to quantize your model, or write your own
+   # config (referring to those we provide) and change the ``--config`` argument in run_awq_llama.sh.
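   # For orientation, a rough sketch of what such a launch script boils down to, based on
   # ci_check/run.sh from this same patch (the exact contents of run_awq_llama.sh may differ,
   # and /path/to/llmc is a placeholder):
   #   export PYTHONPATH=/path/to/llmc:$PYTHONPATH
   #   python -m llmc --config ../configs/quantization/Awq/awq_w4a16_fakequant_eval.yml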
+   bash run_awq_llama.sh
+   ```
+
+## Configuration
+
+To help users design their configs, we now explain some universal configurations found in all the configs we provide under `llmc/configs/`:
+
+- `model`:
+
+  ```yaml
+  model:
+      # Replace by the name of the class in ``llmc/models/*.py``.
+      type: Llama
+      # Replace by the path of your model.
+      path: model path
+      torch_dtype: auto
+  ```
+
+- `calib`:
+
+  ```yaml
+  # Note: some algorithms do not need ``calib``, like naive... So, you can remove this part.
+  calib:
+      # Replace by the calibration data name, e.g., pileval, c4, wikitext2, or ptb, downloaded before.
+      name: pileval
+      download: False
+      # Replace by the path of one of the calibration datasets, e.g., pileval, c4, wikitext2, or ptb,
+      # downloaded before.
+      path: calib data path
+      n_samples: 128
+      bs: -1
+      seq_len: 512
+      # Replace by the function name in ``llmc/data/dataset/specified_preproc.py``.
+      preproc: general
+      seed: *seed
+  ```
+
+- `eval`:
+
+  ```yaml
+  # If you want to evaluate the PPL of your pretrained/transformed/fake_quant model.
+  eval:
+      # You can evaluate the pretrain, transformed, and fake_quant models, and set the positions
+      # you want to evaluate.
+      eval_pos: [pretrain, transformed, fake_quant]
+      # Replace by the name of the eval data, e.g., c4, wikitext2, ptb, or [c4, wikitext2],
+      # downloaded before.
+      name: wikitext2
+      download: False
+      path: eval data path
+      # For 70B model eval, bs can be set to 20, and inference_per_block can be set to True.
+      # For 7B / 13B model eval, bs can be set to 1, and inference_per_block can be set to False.
+      bs: 1
+      inference_per_block: False
+      seq_len: 2048
+  ```
+
+- `save`:
+
+  ```yaml
+  save:
+      # ``save_trans`` is True, which means you want to export the transformed model, i.e., a parameter-modified
+      # model whose performance and structure are the same as the original model; users can
+      # apply naive quantization to the transformed model to obtain the same performance as
+      # the specific-algorithm-quantized model.
+      save_trans: False
+      # ``save_lightllm`` is True, which means you want to export a real quantized model, i.e.,
+      # low-bit weights with weight and activation quantization parameters.
+      save_lightllm: False
+      # ``save_fake`` is True, which means you want to export a fake_quant model, i.e.,
+      # dequantized weights with activation quantization parameters.
+      save_fake: False
+      save_path: ./save
+  ```
+
+- `quant`:
+
+  ```yaml
+  quant:
+      # Replace by the class name in ``llmc/compression/quantization/*.py``.
+      method: OmniQuant
+      # Weight-only quantization does not have an ``act`` part.
+      weight:
+          bit: 8
+          symmetric: True
+          # Quantization granularity: per_channel, per_tensor, per_head (not recommended).
+          granularity: per_channel
+          group_size: -1
+          # Calibration algorithms: learnable, mse, and minmax (default).
+          calib_algo: learnable
+          # Utilize Straight-Through Estimation, which is necessary for learnable
+          # calibration algorithms.
+          ste: True
+      act:
+          bit: 8
+          symmetric: True
+          # Quantization granularity: per_token, per_tensor
+          granularity: per_token
+          ste: True
+          # Static quantization (quantization during calibration) or dynamic
+          # quantization (quantization during inference).
+          static: True
+      # This part is designed for specific algorithms; users can refer to
+      # those we provide to design their own.
+ special: + let: True + lwc_lr: 0.01 + let_lr: 0.005 + use_shift: False + alpha: 0.5 + deactive_amp: True + epochs: 20 + wd: 0 + # If quant_out is True, employ the outputs of the former quantized block as the + # calibration data of the proceeding block. + quant_out: True + ``` ## Supported Model List @@ -138,34 +247,8 @@ Please refer to the 🚀`Quick Start` section in the [documentation](https://llm ✅ [LLaVA](https://github.com/haotian-liu/LLaVA) -✅ [InternLM2.5](https://huggingface.co/internlm) - -✅ [StableLM](https://github.com/Stability-AI/StableLM) - -✅ [Gemma2](https://huggingface.co/docs/transformers/main/en/model_doc/gemma2) - -✅ [Phi2](https://huggingface.co/microsoft/phi-2) - -✅ [Phi 1.5](https://huggingface.co/microsoft/phi-1_5) - -✅ [MiniCPM](https://github.com/OpenBMB/MiniCPM) - -✅ [SmolLM](https://huggingface.co/collections/HuggingFaceTB/smollm-6695016cad7167254ce15966) - You can add your own model type referring to files under `llmc/models/*.py`. -## Supported Backend List - -✅ [VLLM](https://github.com/vllm-project/vllm) - -✅ [LightLLM](https://github.com/ModelTC/lightllm) - -✅ [Sglang](https://github.com/sgl-project/sglang) - -✅ [MLC-LLM](https://github.com/mlc-ai/mlc-llm) - -✅ [AutoAWQ](https://github.com/casper-hansen/AutoAWQ) - ## Supported Algorithm List ### Quantization @@ -225,7 +308,6 @@ We develop our code referring to the following repos: - https://github.com/mobiusml/hqq - [https://github.com/spcl/QuaRot](https://github.com/spcl/QuaRot) - [https://github.com/locuslab/wanda](https://github.com/locuslab/wanda) -- [https://github.com/EleutherAI/lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-harness) ## Star History diff --git a/README_ja.md b/README_ja.md index ed69b3f0d..b093271a8 100644 --- a/README_ja.md +++ b/README_ja.md @@ -1,114 +1,209 @@ -# LLMC: 正確で効率的なLLM圧縮に向けて +# llmc: 正確で効率的なLLM圧縮に向けて llmc -[![License](https://img.shields.io/badge/License-Apache_2.0-blue.svg)](https://opensource.org/licenses/Apache-2.0) +[![ライセンス](https://img.shields.io/badge/ライセンス-Apache_2.0-blue.svg)](https://opensource.org/licenses/Apache-2.0) [![arXiv](https://img.shields.io/badge/LLMC-2405.06001-b31b1b)](https://arxiv.org/abs/2405.06001) -[![GitHub Stars](https://img.shields.io/github/stars/ModelTC/llmc.svg?style=social&label=Star&maxAge=60)](https://github.com/ModelTC/llmc) -![visitors](https://komarev.com/ghpvc/?username=llmc&label=visitors) -[![Discord Banner](https://img.shields.io/discord/1139835312592392214?logo=discord&logoColor=white)](https://discord.gg/qZKUDfhm) +[![GitHub スター](https://img.shields.io/github/stars/ModelTC/llmc.svg?style=social&label=Star&maxAge=60)](https://github.com/ModelTC/llmc) +![訪問者](https://komarev.com/ghpvc/?username=llmc&label=visitors) +[![Discord バナー](https://img.shields.io/discord/1139835312592392214?logo=discord&logoColor=white)](https://discord.gg/qZKUDfhm) [![QQ](https://img.shields.io/badge/QQ-EB1923?logo=tencent-qq&logoColor=white)](http://qm.qq.com/cgi-bin/qm/qr?_wv=1027&k=I9IGPWWj8uuRXWH3_ELWjouf6gkIMgUl&authKey=GA3WbFAsm90ePJf%2FCbc7ZyXXq4ShQktlBaLxgqS5yuSPAsr3%2BDKMRdosUiLYoilO&noverify=0&group_code=526192592) [![Doc](https://img.shields.io/badge/docs-English-99cc2)](https://llmc-en.readthedocs.io/en/latest/) [![Doc](https://img.shields.io/badge/文档-中文-99cc2)](https://llmc-zhcn.readthedocs.io/en/latest/) -**\[ English | [中文](README_zh.md) | [日本語](README_ja.md) \]** +**\[ [English](README.md) | [中文](README_zh.md) | 日本語 \]** -**LLMC** は、大規模言語モデル(LLM)の圧縮を目的とした、最新の圧縮アルゴリズムを活用して、パフォーマンスを損なうことなく効率を向上させ、モデルサイズを削減するためのツールです。 
+**llmc** は、最先端の圧縮アルゴリズムを活用して、パフォーマンスを損なうことなく効率を向上させ、モデルサイズを削減することを目的とした、オフ・ザ・シェルフのツールです。 -**英語のドキュメント**は[こちら](https://llmc-en.readthedocs.io/en/latest/)。 +**英語のドキュメント**は[こちら](https://llmc-en.readthedocs.io/en/latest/)です。 -**中国語のドキュメント**は[こちら](https://llmc-zhcn.readthedocs.io/en/latest/)。 - -**Docker Hub**は[こちら](https://hub.docker.com/r/llmcompression/llmc)。 - -**aliyun docker**: `registry.cn-hangzhou.aliyuncs.com/yongyang/llmcompression:[tag]` - -以下のコマンドを使用して、llmcを実行できるDockerイメージをダウンロードできます。中国大陸のユーザーは、阿里云Dockerを使用することを推奨します。 - -docker hub - -``` -docker pull llmcompression/llmc:pure-latest -``` - -阿里云Docker - -``` -docker pull registry.cn-hangzhou.aliyuncs.com/yongyang/llmcompression:pure-latest -``` +**中国語のドキュメント**は[こちら](https://llmc-zhcn.readthedocs.io/en/latest/)です。 **コミュニティ**: -- [Discordサーバー](https://discord.gg/qZKUDfhm) -- [Tencent QQグループ](http://qm.qq.com/cgi-bin/qm/qr?_wv=1027&k=I9IGPWWj8uuRXWH3_ELWjouf6gkIMgUl&authKey=GA3WbFAsm90ePJf%2FCbc7ZyXXq4ShQktlBaLxgqS5yuSPAsr3%2BDKMRdosUiLYoilO&noverify=0&group_code=526192592) - -## 最新情報 - -- **2024年9月26日:** 🔥 `LLMC`からの✨ `FP8量子化(E4M3、E5M2)`モデルを、VLLMやSGLangのような高度な推理バックエンドにエクスポートできるようになりました。🚀 詳細な使用方法については、[VLLMのドキュメント](https://llmc-en.readthedocs.io/en/latest/backend/vllm.html)と[SGLangのドキュメント](https://llmc-en.readthedocs.io/en/latest/backend/sglang.html)を参照してください。 - -- **2024年9月24日:** 🔥 私たちは正式に ✨`Llama-3.1-405B` の ✅INT4 と ✅INT8 モデルをリリースしました。これらは 🚀`LLMC` の `save_lightllm` モードを使用して量子化されています。モデルパラメータは[こちら](https://huggingface.co/Dongz/llama31-405b-quant)からダウンロードできます。 +- [Discord サーバー](https://discord.gg/qZKUDfhm) +- [Tencent QQ グループ](http://qm.qq.com/cgi-bin/qm/qr?_wv=1027&k=I9IGPWWj8uuRXWH3_ELWjouf6gkIMgUl&authKey=GA3WbFAsm90ePJf%2FCbc7ZyXXq4ShQktlBaLxgqS5yuSPAsr3%2BDKMRdosUiLYoilO&noverify=0&group_code=526192592) -- **2024年9月23日:** 🔥 私たちは、🚀`LLMC` から ✨`実際の量子化された(INT4, INT8)` モデルを、 [VLLM](https://github.com/vllm-project/vllm), [SGLang](https://github.com/sgl-project/sglang), [AutoAWQ](https://github.com/casper-hansen/AutoAWQ), [MLC-LLM](https://github.com/mlc-ai/mlc-llm) などの高度な推論バックエンドにエクスポートするサポートを追加しました。これにより、✨`メモリ使用量の削減` と ✨`推論速度の向上` が可能になります。 - 詳細については、[VLLMドキュメント](https://llmc-en.readthedocs.io/en/latest/backend/vllm.html)、[SGLangドキュメント](https://llmc-en.readthedocs.io/en/latest/backend/sglang.html)、[AutoAWQドキュメント](https://llmc-en.readthedocs.io/en/latest/backend/autoawq.html)、および [MLC-LLMドキュメント](https://llmc-en.readthedocs.io/en/latest/backend/mlcllm.html) を参照してください。 +## ニュース -- **2024年9月9日:** 🔥 パフォーマンス向上のためのベストプラクティス構成をいくつか提供しています(ベストプラクティスは[こちら](https://llmc-en.readthedocs.io/en/latest/)をご覧ください)。 +- **2024 年 7 月 23 日:** 🍺🍺🍺 新しいバージョンのベンチマーク ペーパーをリリースします: -- **2024年9月3日:** 🔥 私たちは、[opencompass](https://github.com/open-compass/opencompass) を使用して 🚀`LLMC` モデルを評価するサポートを提供しています。この[ドキュメント](https://llmc-en.readthedocs.io/en/latest/advanced/model_test_v2.html)に従って試してみてください! 
+ [**LLMC: 多用途の圧縮ツールキットを使用した大規模言語モデル量子化のベンチマーク**](https://arxiv.org/abs/2405.06001v2)。 -- **2024年8月22日:** 🔥私たちは現在のSOTAモデル [SmolLM](https://huggingface.co/collections/HuggingFaceTB/smollm-6695016cad7167254ce15966) を含む多くの小型言語モデルをサポートしています([サポートされているモデルリスト](#supported-model-list)を参照してください)。 + [Ruihao Gong\*](https://xhplus.github.io/)、[Yang Yong\*](https://github.com/helloyongyang)、[Shiqiao Gu\*](https://github.com/gushiqiao)、[Yushi Huang\*](https://github.com/Harahan)、[Chengtao Lv](https://scholar.google.com/citations?user=r8vseSUAAAAJ&hl=en)、[Yunchen Zhang](https://scholar.google.com/citations?user=glkWFyUAAAAJ&hl=en)、[Xianglong Liu📧](https://xlliu-beihang.github.io/)、[Dacheng Tao](https://scholar.google.com/citations?user=RwlJNLcAAAAJ&hl=en) -- **2024年8月22日:** 🔥また、修正された [lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-harness) を使用した下流タスクの評価もサポートしています 🤗。具体的には、`save_trans` モードを使用して([構成](https://llmc-en.readthedocs.io/en/latest/configs.html)の `save` 部分を参照)変換されたモデルを保存し、その後、[run_lm_eval.sh](scripts/run_lm_eval.sh) を参照して量子化されたモデルを直接評価できます。詳細は[こちら](https://llmc-en.readthedocs.io/en/latest/advanced/model_test_v1.html)をご覧ください。 + (\* は同等の貢献、📧 は対応する貢献を表します著者。) -- **2024年7月23日:** 🍺🍺🍺 新しいベンチマーク論文をリリースしました: - - [**LLMC: Benchmarking Large Language Model Quantization with a Versatile Compression Toolkit**](https://arxiv.org/abs/2405.06001v2)。 +
+ comp +
- [Ruihao Gong\*](https://xhplus.github.io/), [Yang Yong\*](https://github.com/helloyongyang), [Shiqiao Gu\*](https://github.com/gushiqiao), [Yushi Huang\*](https://github.com/Harahan), [Chengtao Lv](https://scholar.google.com/citations?user=r8vseSUAAAAJ&hl=en), [Yunchen Zhang](https://scholar.google.com/citations?user=glkWFyUAAAAJ&hl=en), [Xianglong Liu📧](https://xlliu-beihang.github.io/), [Dacheng Tao](https://scholar.google.com/citations?user=RwlJNLcAAAAJ&hl=en) + ベストプラクティスに焦点を当てるのではなく、キャリブレーションデータ、アルゴリズム、データ形式を考慮して、LLM量子化をモジュール式かつ公平にベンチマークします。詳細な観察と分析により、さまざまな構成でパフォーマンスと方法を改善するためのさまざまなタイプの新しいポイントを提供します。強力なツールキットLLMCと包括的な洞察により、将来のLLM研究者は、アプリケーションに適したアルゴリズムと低ビット形式を効率的に統合し、大規模な言語モデルの圧縮を民主化できます。 - (\*は同等の貢献を示し、📧は対応する著者を示します。) +- **2024年7月16日:** 🔥現在、llmのスパース化と層間混合ビット量子化のためのWanda/Naive(Magnitude)をサポートしています! -
-過去のニュース +- **2024年7月14日:** 🔥現在、回転ベースの量子化QuaRotをサポートしています! -- **2024年7月16日:** 🔥私たちはLLMの疎化のためのWanda/Naive(マグニチュード)および層ごとの混合ビット量子化のサポートを追加しました! +- **2024年7月4日:** 📱 ディスカッションチャンネルを開設しました。質問がある場合は、コミュニティに参加してください: -- **2024年7月14日:** 🔥私たちは回転ベースの量子化 QuaRot のサポートを追加しました! + - [Discord サーバー](https://discord.gg/qZKUDfhm) + - [Tencent QQ グループ](http://qm.qq.com/cgi-bin/qm/qr?_wv=1027&k=I9IGPWWj8uuRXWH3_ELWjouf6gkIMgkUl&authKey=GA3WbFAsm90ePJf%2FCbc7ZyXXq4ShQktlBaLxgqS5yuSPAsr3%2BDKMRdosUiLYoilO&noverify=0&group_code=526192592) -- **2024年5月17日:** 🚀 私たちは、LLaVA、Mixtral、LLaMA V3、Qwen V2などのいくつかの高度な大規模モデルをサポートしています。お試しください! +- **2024年5月17日:** 🚀 現在、LLaVA、Mixtral、LLaMA V3、Qwen V2などの高度な大規模モデルをサポートしています。試してみてください! -- **2024年5月13日:** 🍺🍺🍺 私たちは量子化ベンチマーク論文をリリースしました: +- **2024年5月13日:** 🍺🍺🍺 量子化ベンチマーク論文を発表しました: - [**LLM-QBench: A Benchmark Towards the Best Practice for Post-training Quantization of Large Language Models**](https://arxiv.org/abs/2405.06001)。 + [**LLM-QBench: 大規模言語モデルのポストトレーニング量子化のベストプラクティスに向けたベンチマーク**](https://arxiv.org/abs/2405.06001). [Ruihao Gong\*](https://xhplus.github.io/), [Yang Yong\*](https://github.com/helloyongyang), [Shiqiao Gu\*](https://github.com/gushiqiao), [Yushi Huang\*](https://github.com/Harahan), [Yunchen Zhang](https://scholar.google.com/citations?user=glkWFyUAAAAJ&hl=en), [Xianglong Liu📧](https://xlliu-beihang.github.io/), [Dacheng Tao](https://scholar.google.com/citations?user=RwlJNLcAAAAJ&hl=en) - (\*は同等の貢献を示し、📧は対応する著者を示します。) + (\* は同等の貢献を示し、📧 は対応する著者を示します。)
comp
- 私たちは、校正コスト、推論効率、量子化精度を考慮して、量子化技術を公正にベンチマークしました。さまざまなモデルとデータセットに関して600件近い実験を行い、校正データ、アルゴリズムパイプライン、および量子化構成の選択に関する3つの洞察を得ました。これらの洞察に基づいて、LLMの後処理量子化パイプラインに対するベストプラクティスが設計され、さまざまなシナリオでのパフォーマンスと効率のバランスを実現します。 - -- **2024年3月7日:** 🚀 私たちは強力で効率的なLLM圧縮ツールの量子化部分をリリースしました。なお、ベンチマーク論文は近日中に公開予定です😊。 + 校正コスト、推論効率、および量子化精度を考慮して、量子化技術をモジュール化し、公平にベンチマークしました。多様なモデルとデータセットでの約600の実験が、校正データ、アルゴリズムパイプライン、および量子化構成の選択に関する3つの洞察を提供します。これらの洞察に基づいて、LLM PTQパイプラインのベストプラクティスが設計され、さまざまなシナリオで最高の精度と効率のパフォーマンスバランスを実現します。 -
+- **2024年3月7日:** 🚀 強力で効率的なLLM圧縮ツールの量子化部分をリリースしました。注目すべきは、ベンチマーク論文が近日公開予定です😊。 -## 主要機能 +## ハイライト機能 -- 💥**包括的なアルゴリズムサポート**: 広範な ✨`SOTA圧縮アルゴリズム` をサポートし、✅量子化、✅混合精度量子化、✅疎性を含み、元のリポジトリと同じ精度を維持します。✨`量子化ベストプラクティス`(ベストプラクティスは[こちら](https://llmc-en.readthedocs.io/en/latest/)をご覧ください)も提供されており、最適なパフォーマンスと効率を確保します。 - -- 💥**サポートされているフォーマット**: ✨`量子化`(整数および浮動小数点)と ✨`疎性` の両方をサポートし、具体的には ✅重量-活性化、✅重量のみ、✅混合精度量子化、および ✅構造化疎性 と ✅非構造化疎性 を含みます。 - -- 💥**広範なモデルサポート**: 多様な ✨`LLMモデル` をサポートしており、✅LLama、✅Mistral、✅InternLM2、✅Qwen2 など、さらに ✅MOE モデルや ✅VLM モデルもサポートしています([サポートされているモデルリスト](#supported-model-list)を参照してください)。 - -- 💥**マルチバックエンドの互換性**: 複数のバックエンドとシームレスに統合し、展開の柔軟性を強化します。さまざまな量子化設定およびモデルフォーマットが、✅VLLM、✅Sglang、✅LightLLM、✅MLC-LLM、✅AutoAWQ など、幅広いバックエンドおよびハードウェアプラットフォームと互換性があり、高い柔軟性を実現しています(`Backend`セクションは[こちら](https://llmc-en.readthedocs.io/en/latest/)をご覧ください)。 - -- 💥**パフォーマンス効率**: ✨`Llama3.1-405B` や ✨`OPT-175B` などの大規模LLMの量子化をサポートし、`単一の A100/H100/H800 GPU` でPPL評価を可能にします。 +- LLMs(例:Llama2-70B、OPT-175B)を量子化し、1つのA100/H100/H800 GPUでPPLを評価します💥。 +- ユーザーが選択できる最先端の圧縮アルゴリズムが[元のリポジトリと一致](benchmark/align.md)し、ユーザーは1つのLLMで複数のアルゴリズムを順次使用できます💥。 +- 特定の圧縮アルゴリズムでツールによってエクスポートされた変換モデル([構成](#構成)の`quant`部分の`save_trans`モード)は、複数のバックエンド(例:[Lightllm](https://github.com/ModelTC/lightllm)、[TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM))によって単純な量子化を行い、特定の圧縮アルゴリズムで最適化されたモデルを取得できます。対応するバックエンドが推論できます💥。 +- 浅いメモリフットプリントを持つ圧縮モデル([構成](#構成)の`quant`部分の`save_lightllm`モード)は、[Lightllm](https://github.com/ModelTC/lightllm)によって直接推論できます💥。 ## 使用方法 -使用ガイドは 🚀`Quick Start`セクション[こちら](https://llmc-en.readthedocs.io/en/latest/)をご覧ください。 +1. このリポジトリをクローンし、パッケージをインストールします: + + ```shell + # パッケージをインストール + cd llmc + pip install -r requirements.txt + ``` + +2. モデルとデータを準備します。 + + ```shell + # huggingfaceからLLMをダウンロードした後、次のように校正データと評価データを準備します: + cd tools + python download_calib_dataset.py --save_path [校正データパス] + python download_eval_dataset.py --save_path [評価データパス] + ``` + +3. アルゴリズムを選択してモデルを量子化します: + + ```shell + # これはAwqに関する例です: + cd scripts + # bashファイル内のllmcのパス``llmc_path``を変更します。``llmc/configs/quantization/Awq/``に配置された構成の1つを選択してモデルを量子化するか、run_awq_llama.shの``--config``引数を変更して提供された構成を使用します。 + bash run_awq_llama.sh + ``` + +## 構成 + +ユーザーが構成を設計するのを支援するために、`llmc/configs/`の下に提供されているすべての構成のいくつかの一般的な構成を説明します: + +- `model`: + + ```yaml + model: + # ``llmc/models/*.py``のクラス名に置き換えます。 + type: Llama + # モデルのパスに置き換えます。 + path: model path + torch_dtype: auto + ``` + +- `calib`: + + ```yaml + # 注意:一部のアルゴリズムには``calib``が必要ありません。例:naive... 
したがって、この部分を削除できます。 + calib: + # 以前にダウンロードした校正データ名に置き換えます。例:pileval、c4、wikitext2、またはptb。 + name: pileval + download: False + # 以前にダウンロードした校正データの1つのパスに置き換えます。例:pileval、c4、wikitext2、またはptb。 + path: calib data path + n_samples: 128 + bs: -1 + seq_len: 512 + # ``llmc/data/dataset/specified_preproc.py``の関数名に置き換えます。 + preproc: general + seed: *seed + ``` + +- `eval`: + + ```yaml + # 事前トレーニング/変換/偽量子化モデルのPPLを評価したい場合。 + eval: + # 事前トレーニング、変換、偽量子化モデルを評価し、評価したい位置を設定できます。 + eval_pos: [pretrain, transformed, fake_quant] + # 以前にダウンロードした評価データの名前に置き換えます。例:c4、wikitext2、ptb、または[c4, wikitext2]。 + name: wikitext2 + download: False + path: eval data path + # 70Bモデルの評価の場合、bsを20に設定し、inference_per_blockをTrueに設定できます。 + # 7B / 13Bモデルの評価の場合、bsを1に設定し、inference_per_blockをFalseに設定できます。 + bs: 1 + inference_per_block: False + seq_len: 2048 + ``` + +- `save`: + + ```yaml + save: + # ``save_trans``がTrueの場合、変換モデル(例:パラメータが変更されたモデル)をエクスポートしたいことを意味します。パフォーマンスと構造は元のモデルと同じであり、ユーザーは単純な量子化を使用して、特定のアルゴリズムで量子化されたモデルと同じパフォーマンスを得ることができます。 + save_trans: False + # ``save_lightllm``がTrueの場合、実際の量子化モデル(例:低ビットの重みと重みおよびアクティベーションの量子化パラメータ)をエクスポートしたいことを意味します。 + save_lightllm: False + # ``save_fake``がTrueの場合、偽量子化モデル(例:量子化解除された重みとアクティベーションの量子化パラメータ)をエクスポートしたいことを意味します。 + save_fake: False + save_path: ./save + ``` + +- `quant`: + + ```yaml + quant: + # ``llmc/compression/quantization/*.py``のクラス名に置き換えます。 + method: OmniQuant + # 重みのみの量子化には``act``部分がありません。 + weight: + bit: 8 + symmetric: True + # 量子化の粒度:per_channel、per_tensor、per_head(推奨されません)。 + granularity: per_channel + group_size: -1 + # 校正アルゴリズム:learnble、mse、およびminmax(デフォルト)。 + calib_algo: learnable + # ストレートスルー推定を使用します。これは、学習可能な校正アルゴリズムに必要です。 + ste: True + act: + bit: 8 + symmetric: True + # 量子化の粒度:per_token、per_tensor + granularity: per_token + ste: True + # 静的量子化(校正中の量子化)または動的量子化(推論中の量子化)。 + static: True + # この部分は特定のアルゴリズム用に設計されており、提供されているものを参考にして独自のアルゴリズムを設計できます。 + special: + let: True + lwc_lr: 0.01 + let_lr: 0.005 + use_shift: False + alpha: 0.5 + deactive_amp: True + epochs: 20 + wd: 0 + # quant_outがTrueの場合、前の量子化ブロックの出力を次のブロックの校正データとして使用します。 + quant_out: True + ``` ## サポートされているモデルリスト @@ -136,33 +231,7 @@ docker pull registry.cn-hangzhou.aliyuncs.com/yongyang/llmcompression:pure-lates ✅ [LLaVA](https://github.com/haotian-liu/LLaVA) -✅ [InternLM2.5](https://huggingface.co/internlm) - -✅ [StableLM](https://github.com/Stability-AI/StableLM) - -✅ [Gemma2](https://huggingface.co/docs/transformers/main/en/model_doc/gemma2) - -✅ [Phi2](https://huggingface.co/microsoft/phi-2) - -✅ [Phi 1.5](https://huggingface.co/microsoft/phi-1_5) - -✅ [MiniCPM](https://github.com/OpenBMB/MiniCPM) - -✅ [SmolLM](https://huggingface.co/collections/HuggingFaceTB/smollm-6695016cad7167254ce15966) - -独自のモデルタイプを追加するには、`llmc/models/*.py` ファイルを参照してください。 - -## サポートされているバックエンドリスト - -✅ [VLLM](https://github.com/vllm-project/vllm) - -✅ [LightLLM](https://github.com/ModelTC/lightllm) - -✅ [Sglang](https://github.com/sgl-project/sglang) - -✅ [MLC-LLM](https://github.com/mlc-ai/mlc-llm) - -✅ [AutoAWQ](https://github.com/casper-hansen/AutoAWQ) +`llmc/models/*.py`の下のファイルを参照して、独自のモデルタイプを追加できます。 ## サポートされているアルゴリズムリスト @@ -198,9 +267,9 @@ docker pull registry.cn-hangzhou.aliyuncs.com/yongyang/llmcompression:pure-lates ✅ [QuaRot](https://arxiv.org/abs/2404.00456) -### プルーニング(剪定) +### 剪定 -✅ Naive(マグニチュード) +✅ Naive(Magnitude) ✅ [Wanda](https://arxiv.org/abs/2306.11695) @@ -223,7 +292,6 @@ docker pull registry.cn-hangzhou.aliyuncs.com/yongyang/llmcompression:pure-lates - https://github.com/mobiusml/hqq - 
[https://github.com/spcl/QuaRot](https://github.com/spcl/QuaRot) - [https://github.com/locuslab/wanda](https://github.com/locuslab/wanda) -- [https://github.com/EleutherAI/lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-harness) ## スター履歴 diff --git a/README_zh.md b/README_zh.md index e8ed8a4a5..4732f561a 100644 --- a/README_zh.md +++ b/README_zh.md @@ -1,114 +1,211 @@ -# LLMC: 准确高效的LLM压缩工具 +# llmc:向精确高效的大型语言模型压缩迈进 llmc -[![License](https://img.shields.io/badge/License-Apache_2.0-blue.svg)](https://opensource.org/licenses/Apache-2.0) +[![许可证](https://img.shields.io/badge/许可证-Apache_2.0-blue.svg)](https://opensource.org/licenses/Apache-2.0) [![arXiv](https://img.shields.io/badge/LLMC-2405.06001-b31b1b)](https://arxiv.org/abs/2405.06001) -[![GitHub Stars](https://img.shields.io/github/stars/ModelTC/llmc.svg?style=social&label=Star&maxAge=60)](https://github.com/ModelTC/llmc) -![visitors](https://komarev.com/ghpvc/?username=llmc&label=visitors) +[![GitHub 星标](https://img.shields.io/github/stars/ModelTC/llmc.svg?style=social&label=Star&maxAge=60)](https://github.com/ModelTC/llmc) [![Discord Banner](https://img.shields.io/discord/1139835312592392214?logo=discord&logoColor=white)](https://discord.gg/qZKUDfhm) [![QQ](https://img.shields.io/badge/QQ-EB1923?logo=tencent-qq&logoColor=white)](http://qm.qq.com/cgi-bin/qm/qr?_wv=1027&k=I9IGPWWj8uuRXWH3_ELWjouf6gkIMgUl&authKey=GA3WbFAsm90ePJf%2FCbc7ZyXXq4ShQktlBaLxgqS5yuSPAsr3%2BDKMRdosUiLYoilO&noverify=0&group_code=526192592) [![Doc](https://img.shields.io/badge/docs-English-99cc2)](https://llmc-en.readthedocs.io/en/latest/) [![Doc](https://img.shields.io/badge/文档-中文-99cc2)](https://llmc-zhcn.readthedocs.io/en/latest/) -**\[ English | [中文](README_zh.md) | [日本語](README_ja.md) \]** +**\[ [English](https://github.com/ModelTC/llmc?tab=readme-ov-file#llmc-towards-accurate-and-efficient-llm-compression) | 中文 | [日本語](README_ja.md) \]** -**LLMC** 是一个开箱即用的工具,专为压缩LLM设计,利用最先进的压缩算法提高效率并减少模型体积,同时不影响预测精度。 +**llmc** 是一个即插即用的工具,旨在通过最先进的压缩算法进行大型语言模型的压缩,以提高效率并减小模型大小,同时不牺牲性能。 -**英文文档**在[此处](https://llmc-en.readthedocs.io/en/latest/)。 +**英文文档**在[这里](https://llmc-en.readthedocs.io/en/latest/). -**中文文档**在[此处](https://llmc-zhcn.readthedocs.io/en/latest/)。 - -**docker hub**在[此处](https://hub.docker.com/r/llmcompression/llmc)。 - -**阿里云docker**: `registry.cn-hangzhou.aliyuncs.com/yongyang/llmcompression:[tag]` - -你可以通过以下命令下载可以运行llmc的docker镜像,中国大陆用户推荐使用阿里云docker。 - -docker hub - -``` -docker pull llmcompression/llmc:pure-latest -``` - -阿里云docker - -``` -docker pull registry.cn-hangzhou.aliyuncs.com/yongyang/llmcompression:pure-latest -``` +**中文文档**在[这里](https://llmc-zhcn.readthedocs.io/en/latest/). 
**社区**: -- [Discord 服务器](https://discord.gg/qZKUDfhm) -- [腾讯QQ群](http://qm.qq.com/cgi-bin/qm/qr?_wv=1027&k=I9IGPWWj8uuRXWH3_ELWjouf6gkIMgUl&authKey=GA3WbFAsm90ePJf%2FCbc7ZyXXq4ShQktlBaLxgqS5yuSPAsr3%2BDKMRdosUiLYoilO&noverify=0&group_code=526192592) - -## 最新消息 - -- **2024年9月26日:** 🔥 我们现在支持从🚀 `LLMC`导出💥 `FP8 量化(E4M3,E5M2)`模型到一些先进的推理后端,例如[VLLM](https://github.com/vllm-project/vllm)和[SGLang](https://github.com/sgl-project/sglang)。关于详细使用方法,请参阅[VLLM文档](https://llmc-zhcn.readthedocs.io/en/latest/backend/vllm.html)和[SGLang文档](https://llmc-zhcn.readthedocs.io/en/latest/backend/sglang.html)。 +- [Discord群](https://discord.gg/qZKUDfhm) +- [QQ群](http://qm.qq.com/cgi-bin/qm/qr?_wv=1027&k=I9IGPWWj8uuRXWH3_ELWjouf6gkIMgUl&authKey=GA3WbFAsm90ePJf%2FCbc7ZyXXq4ShQktlBaLxgqS5yuSPAsr3%2BDKMRdosUiLYoilO&noverify=0&group_code=526192592) -- **2024年9月24日:** 🔥 我们正式发布了 ✨`Llama-3.1-405B` 的 ✅INT4 和 ✅INT8 模型,这些模型通过 🚀`LLMC` 使用 `save_lightllm` 模式进行量化。你可以在[此处](https://huggingface.co/Dongz/llama31-405b-quant)下载模型参数。 +## 新闻 -- **2024年9月23日:** 🔥 我们现在支持从 🚀`LLMC` 导出 ✨`真正量化的(INT4, INT8)` 模型到高级推理后端,例如 [VLLM](https://github.com/vllm-project/vllm), [SGLang](https://github.com/sgl-project/sglang), [AutoAWQ](https://github.com/casper-hansen/AutoAWQ), 和 [MLC-LLM](https://github.com/mlc-ai/mlc-llm) 用于量化推理部署,从而实现 ✨`减少内存使用` 和 ✨`加快推理速度`。 - 详细使用方法,请参考 [VLLM 文档](https://llmc-zhcn.readthedocs.io/en/latest/backend/vllm.html)、[SGLang 文档](https://llmc-zhcn.readthedocs.io/en/latest/backend/sglang.html)、[AutoAWQ 文档](https://llmc-zhcn.readthedocs.io/en/latest/backend/autoawq.html) 和 [MLC-LLM 文档](https://llmc-zhcn.readthedocs.io/en/latest/backend/mlcllm.html)。 +- **2024 年 7 月 23 日:** 🍺🍺🍺 我们发布了全新版本的基准论文: -- **2024年9月9日:** 🔥 我们提供了一些最佳实践配置,帮助提升性能(参见最佳实践[此处](https://llmc-zhcn.readthedocs.io/en/latest/))。 - -- **2024年9月3日:** 🔥 我们支持通过[opencompass](https://github.com/open-compass/opencompass) 评估 🚀`LLMC` 模型。请参考此[文档](https://llmc-zhcn.readthedocs.io/en/latest/advanced/model_test_v2.html)试用! - -- **2024年8月22日:** 🔥我们支持许多小型语言模型,包括当前SOTA的 [SmolLM](https://huggingface.co/collections/HuggingFaceTB/smollm-6695016cad7167254ce15966)(参见[支持的模型列表](#supported-model-list))。 - -- **2024年8月22日:** 🔥此外,我们还支持通过我们修改的 [lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-harness) 进行下游任务评估 🤗。具体操作,用户可以先采用 `save_trans` 模式(参见 [配置](https://llmc-zhcn.readthedocs.io/en/latest/configs.html) 中的 `save` 部分)保存权重修改后的模型。在获得转换模型后,可以直接参考 [run_lm_eval.sh](scripts/run_lm_eval.sh) 对量化模型进行评估。更多细节请见[此处](https://llmc-zhcn.readthedocs.io/en/latest/advanced/model_test_v1.html)。 - -- **2024年7月23日:** 🍺🍺🍺 我们发布了全新的基准论文: - - [**LLMC: Benchmarking Large Language Model Quantization with a Versatile Compression Toolkit**](https://arxiv.org/abs/2405.06001v2)。 + [**LLMC:使用多功能压缩工具包对大型语言模型量化进行基准测试**](https://arxiv.org/abs/2405.06001v2)。 [Ruihao Gong\*](https://xhplus.github.io/), [Yang Yong\*](https://github.com/helloyongyang), [Shiqiao Gu\*](https://github.com/gushiqiao), [Yushi Huang\*](https://github.com/Harahan), [Chengtao Lv](https://scholar.google.com/citations?user=r8vseSUAAAAJ&hl=en), [Yunchen Zhang](https://scholar.google.com/citations?user=glkWFyUAAAAJ&hl=en), [Xianglong Liu📧](https://xlliu-beihang.github.io/), [Dacheng Tao](https://scholar.google.com/citations?user=RwlJNLcAAAAJ&hl=en) (\* 表示同等贡献,📧 表示通讯作者。) -
-历史消息 +
+ comp +
-- **2024年7月16日:** 🔥我们现在支持 Wanda/Naive(幅度)进行 LLM 稀疏化和逐层混合比特量化! + 我们不关注最佳实践,而是考虑校准数据、算法和数据格式,以模块化和公平的方式对 LLM 量化进行基准测试。通过详细的观察和分析,我们为不同配置下的性能和方法改进提供了各种类型的新点。借助强大的工具包 LLMC 和全面的见解,未来的 LLM 研究人员可以有效地将合适的算法和低位格式集成到他们的应用中,从而使大型语言模型的压缩变得民主化。 -- **2024年7月14日:** 🔥我们现在支持基于旋转的量化 QuaRot! +- **2024年7月16号:** 🔥我们现在已经支持了大模型稀疏算法Wanda/Naive(Magnitude)和层间混合bit量化! -- **2024年5月17日:** 🚀 我们现在支持一些先进的大型模型,例如 LLaVA、Mixtral、LLaMA V3 和 Qwen V2。快来试试吧! +- **2024年7月14号:** 🔥我们现在已经支持了旋转类量化算法QuaRot! + +- **2024年7月4日:** 📱 我们提供了公开的讨论渠道. 如果您有任何问题,可以加入我们的社区: + + - [Discord群](https://discord.gg/qZKUDfhm) + - [QQ群](http://qm.qq.com/cgi-bin/qm/qr?_wv=1027&k=I9IGPWWj8uuRXWH3_ELWjouf6gkIMgUl&authKey=GA3WbFAsm90ePJf%2FCbc7ZyXXq4ShQktlBaLxgqS5yuSPAsr3%2BDKMRdosUiLYoilO&noverify=0&group_code=526192592) - **2024年5月13日:** 🍺🍺🍺 我们发布了量化基准论文: - [**LLM-QBench: A Benchmark Towards the Best Practice for Post-training Quantization of Large Language Models**](https://arxiv.org/abs/2405.06001)。 + [**LLM-QBench:大型语言模型训练后量化的最佳实践基准**](https://arxiv.org/abs/2405.06001). [Ruihao Gong\*](https://xhplus.github.io/), [Yang Yong\*](https://github.com/helloyongyang), [Shiqiao Gu\*](https://github.com/gushiqiao), [Yushi Huang\*](https://github.com/Harahan), [Yunchen Zhang](https://scholar.google.com/citations?user=glkWFyUAAAAJ&hl=en), [Xianglong Liu📧](https://xlliu-beihang.github.io/), [Dacheng Tao](https://scholar.google.com/citations?user=RwlJNLcAAAAJ&hl=en) - (\* 表示同等贡献,📧 表示通讯作者。) + (\* 表示共同第一作者, 📧 表示通讯作者.)
comp
- 我们模块化且公平地基准测试了量化技术,考虑了校准成本、推理效率和量化准确性。在多种模型和数据集上进行了近600次实验,得出了三个关于校准数据、算法管道和量化配置选择的有见地的结论。基于这些结论,设计了一种LLM后训练量化管道的最佳实践,以在各种场景下实现最佳的准确性和效率平衡。 - -- **2024年3月7日:** 🚀 我们发布了一个功能强大且高效的LLM压缩工具的量化部分。值得注意的是,我们的基准论文即将发布😊。 - -
- -## 亮点功能 - -- 💥**综合算法支持**: 提供广泛的 ✨`SOTA压缩算法` 支持,包括 ✅量化、✅混合精度量化 和 ✅稀疏化,同时保持与原始仓库一致的精度。我们还提供 ✨`量化最佳实践`(参见✨`最佳实践` 章节[此处](https://llmc-zhcn.readthedocs.io/en/latest/)),确保最佳性能和效率。 - -- 💥**支持的格式**: 支持 ✨`量化`(整型和浮点)和 ✨`稀疏化`,具体包括 ✅权重激活量化、✅权重量化、✅混合精度量化,以及 ✅结构化 和 ✅非结构化稀疏化。 - -- 💥**广泛模型支持**: 支持多种 ✨`LLM模型`,包括 ✅LLama、✅Mistral、✅InternLM2、✅Qwen2 等,以及 ✅MOE 和 ✅VLM 模型(参见[支持的模型列表](#supported-model-list))。 - -- 💥**多后端兼容性**: 无缝集成多个后端,增强部署灵活性。多种量化设置和模型格式兼容广泛的后端和硬件平台,例如 ✅VLLM、✅Sglang、✅LightLLM、✅MLC-LLM 和 ✅AutoAWQ,使其高度灵活(参见✨`推理后端` 章节 [此处](https://llmc-zhcn.readthedocs.io/en/latest/))。 - -- 💥**性能效率**: 支持大规模LLM的量化,例如 ✨`Llama3.1-405B` 和 ✨`OPT-175B`,并可在 `单个 A100/H100/H800 GPU` 上评估 PPL。 - -## 使用指南 - -请参阅 🚀`快速入门`章节[此处](https://llmc-zhcn.readthedocs.io/en/latest/)。 + 我们模块化并公正地基准测试了量化技术,考虑到校准成本、推理效率和量化精度。在多种模型和数据集上进行的近 600 项实验提供了三个洞见: + 关于校准数据、算法流程和量化配置选择。基于这些洞见,设计了一个最佳的大型语言模型 PTQ 流程,实现了在各种场景下最佳的精确度和效率性能平衡。 + +- **2024年3月7日:** 🚀 我们发布了强大且高效的大型语言模型压缩工具的量化部分。值得注意的是,我们的基准论文即将发布😊。 + +## 突出特性 + +- 量化大型语言模型,如 Llama2-70B、OPT-175B,并在仅一个 A100/H100/H800 GPU上评估其 PPL💥。 +- 为用户提供选择的最新的[与原论文代码仓库精度对齐](benchmark/align.md)的压缩算法,并且用户可以在一个大型语言模型上依次使用多个算法💥。 +- 由我们工具通过特定压缩算法导出的转换模型(`save_trans`模式在`quant`部分的[配置](#配置))可以通过多个后端进行简单量化,得到经过特定压缩算法优化的模型,相应的后端可以进行推断💥。 +- 我们的压缩模型(`save_lightllm`模式在`quant`部分的\[配置\](# + +配置))具有较低的内存占用,可以直接通过[Lightllm](https://github.com/ModelTC/lightllm)进行推断💥。 + +## 使用方式 + +1. 克隆此仓库并安装包: + + ```shell + # 安装包 + cd llmc + pip install -r requirements.txt + ``` + +2. 准备模型和数据。 + + ```shell + # 在从huggingface下载LLM后,按以下方式准备校准和评估数据: + cd tools + python download_calib_dataset.py --save_path [校准数据路径] + python download_eval_dataset.py --save_path [评估数据路径] + ``` + +3. 选择一个算法来量化你的模型: + + ```shell + # 这是一个关于 Awq 的例子: + cd scripts + # 修改 bash 文件中的 llmc 路径,``llmc_path``。你也可以选择``llmc/configs/quantization/Awq/``中的一个配置来量化你的模型,或者通过更改``--config``参数在 run_awq_llama.sh 中使用我们提供的配置。 + bash run_awq_llama.sh + ``` + +## 配置 + +为了帮助用户设计他们的配置,我们现在解释我们在`llmc/configs/`下提供的所有配置中的一些通用配置: + +- `model`: + + ```yaml + model: + # 用``llmc/models/*.py``中的类名替换。 + type: Llama + # 用你的模型路径替换。 + path: model path + torch_dtype: auto + ``` + +- `calib`: + + ```yaml + # 注意:一些算法不需要``calib``,如 naive... 
所以,你可以移除这部分。 + calib: + # 用之前下载的校准数据名称替换,例如,pileval、c4、wikitext2 或 ptb。 + name: pileval + download: False + # 用之前下载的某个校准数据的路径替换,例如,pileval、c4、wikitext2 或 ptb。 + path: calib data path + n_samples: 128 + bs: -1 + seq_len: 512 + # 用``llmc/data/dataset/specified_preproc.py``中的函数名称替换。 + preproc: general + seed: *seed + ``` + +- `eval`: + + ```yaml + # 如果你想评估你的预训练/转换/假量化模型的 PPL。 + eval: + # 你可以评估预训练、转换、假量化模型,并设置你想要评估的位置。 + eval_pos: [pretrain, transformed, fake_quant] + # 用之前下载的评估数据的名称替换,例如,c4、wikitext2、ptb 或 [c4, wikitext2]。 + name: wikitext2 + download: False + path: eval data path + # 对于 70B 模型评估,bs 可以设置为 20,并且可以将 inference_per_block 设置为 True。 + # 对于 7B / 13B 模型评估,bs 可以设置为 1,并且可以将 inference_per_block 设置为 False。 + bs: 1 + inference_per_block: False + seq_len: 2048 + ``` + +- `save`: + + ```yaml + save: + # 如果``save_trans``为 True,这意味着你想要导出转换模型,例如,参数修改的模型,其性能和结构与原始模型相同,用户可以对转换模型进行简单量化,以获得与特定算法量化模型相同的性能。 + save_trans: False + # 如果``save_lightllm``为 True,这意味着你想要导出真实的量化模型,例如,低位权重和权重及激活量化参数。 + save_lightllm: False + # 如果``save_fake``为 True,意味着你想要导出假量化模型,例如,去量化的权重和激活量化参数。 + save_fake: False + save_path: ./save + + ``` + +- `quant`: + + ```yaml + quant: + # 用``llmc/compression/quantization/*.py``中的类名替换。 + method: OmniQuant + # 仅权重量化没有``act``部分。 + weight: + bit: 8 + symmetric: True + # 量化粒度:per_channel, per_tensor, per_head(不推荐)。 + granularity: per_channel + group_size: -1 + # 校准算法:learnble, mse, 以及 minmax(默认)。 + calib_algo: learnable + # 使用直通估计(Stright-Through Estimation),这对于可学习的校准算法是必需的。 + ste: True + act: + bit: 8 + symmetric: True + # 量化粒度:per_token, per_tensor + granularity: per_token + ste: True + # 静态量化(校准期间的量化)或动态量化(推理期间的量化)。 + static: True + # 这部分是为特定算法设计的,用户可以参考我们提供的算法来设计他们自己的算法。 + special: + let: True + lwc_lr: 0.01 + let_lr: 0.005 + use_shift: False + alpha: 0.5 + deactive_amp: True + epochs: 20 + wd: 0 + # 如果 quant_out 为 True,使用前一个量化块的输出作为后续块的校准数据。 + quant_out: True + + ``` ## 支持的模型列表 @@ -130,39 +227,7 @@ docker pull registry.cn-hangzhou.aliyuncs.com/yongyang/llmcompression:pure-lates ✅ [LLaMA V3](https://huggingface.co/meta-llama) -✅ [Mixtral](https://huggingface.co/docs/transformers/model_doc/mixtral) - -✅ [Qwen V2](https://github.com/QwenLM/Qwen2) - -✅ [LLaVA](https://github.com/haotian-liu/LLaVA) - -✅ [InternLM2.5](https://huggingface.co/internlm) - -✅ [StableLM](https://github.com/Stability-AI/StableLM) - -✅ [Gemma2](https://huggingface.co/docs/transformers/main/en/model_doc/gemma2) - -✅ [Phi2](https://huggingface.co/microsoft/phi-2) - -✅ [Phi 1.5](https://huggingface.co/microsoft/phi-1_5) - -✅ [MiniCPM](https://github.com/OpenBMB/MiniCPM) - -✅ [SmolLM](https://huggingface.co/collections/HuggingFaceTB/smollm-6695016cad7167254ce15966) - -你可以参考 `llmc/models/*.py` 文件添加自己的模型类型。 - -## 支持的后端列表 - -✅ [VLLM](https://github.com/vllm-project/vllm) - -✅ [LightLLM](https://github.com/ModelTC/lightllm) - -✅ [Sglang](https://github.com/sgl-project/sglang) - -✅ [MLC-LLM](https://github.com/mlc-ai/mlc-llm) - -✅ [AutoAWQ](https://github.com/casper-hansen/AutoAWQ) +你可以参考 `llmc/models/*.py` 下的文件添加你自己的模型类型。 ## 支持的算法列表 @@ -200,13 +265,13 @@ docker pull registry.cn-hangzhou.aliyuncs.com/yongyang/llmcompression:pure-lates ### 剪枝 -✅ Naive(Magnitude) +✅ Naive(Magnitude) ✅ [Wanda](https://arxiv.org/abs/2306.11695) ✅ [ShortGPT](https://arxiv.org/abs/2403.03853) -## 鸣谢 +## 致谢 我们的代码参考了以下仓库: @@ -221,15 +286,11 @@ docker pull registry.cn-hangzhou.aliyuncs.com/yongyang/llmcompression:pure-lates - https://github.com/xvyaward/owq - https://github.com/TimDettmers/bitsandbytes - https://github.com/mobiusml/hqq -- 
[https://github.com/spcl/QuaRot](https://github.com/spcl/QuaRot) - [https://github.com/locuslab/wanda](https://github.com/locuslab/wanda) -- [https://github.com/EleutherAI/lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-harness) - -## Star 历史 -[![Star History Chart](https://api.star-history.com/svg?repos=ModelTC/llmc&type=Timeline)](https://star-history.com/#ModelTC/llmc&Timeline) +## 星标历史 -## 引用 +[![星标历史图表](https://api.star-history.com/svg?repos=ModelTC/llmc&type=Timeline)](https://star-history.com/#ModelTC/llmc&Timeline) ## 引用 diff --git a/benchmark/align.md b/benchmark/align.md new file mode 100644 index 000000000..295cd22f0 --- /dev/null +++ b/benchmark/align.md @@ -0,0 +1,42 @@ +## Alignment with the Original Paper + +### The conda environment is consistent with the requirements.txt file and the model is LLama2-7b + +### All other configurations are aligned with the original paper/code: + +| | calib_data | seq_len | num_data | seed | +| ----------- | ---------- | ------- | -------- | ---- | +| GPTQ | c4 | 2048 | 128 | 0 | +| AWQ | pileval | 512 | 128 | 42 | +| Omniquant | wikitext2 | 2048 | 128 | 2 | +| Smoothquant | pileval | 512 | 128 | 42 | +| Os_plus | pileval | 512 | 128 | 42 | + +### Results + +#### Weight-Only Asymmetric Quantization Results + +| | w4a16g128 | w3a16g128 | w2a16g64 | +| -------------- | --------- | --------- | -------- | +| GPTQ | 5.623 | 6.318 | 14.968 | +| GPTQ-LLMC | 5.623 | 6.318 | 14.968 | +| AWQ | 5.601 | 6.243 | 2.16e5 | +| AWQ-LLMC | 5.601 | 6.238 | 2.16e5 | +| Omniquant | 5.590 | 6.092 | 9.525 | +| Omniquant-LLMC | 5.590 | 6.092 | 9.525 | + +#### Weight-Activation Asymmetric Quantization Results + +| | w8a8 | w6a6 | w4a4 | +| -------------- | ----- | ----- | ------ | +| Omniquant | 5.491 | 5.703 | 12.212 | +| Omniquant-LLMC | 5.490 | 5.703 | 12.239 | + +#### Weight-Activation Symmetric Quantization Results + +| | w8a8 | +| ---------------- | ----- | +| SmoothQuant | 5.589 | +| SmoothQuant-LLMC | 5.589 | +| Os_plus | 5.511 | +| Os_plus-LLMC | 5.517 | diff --git a/benchmark/calib.md b/benchmark/calib.md new file mode 100644 index 000000000..ddef3c41d --- /dev/null +++ b/benchmark/calib.md @@ -0,0 +1,53 @@ +## Impact of calibration data + +### Setting 1: w4a16g128 llama2-7b seq_len=512 + +#### Calibrate with wikitext2 + +| | wikitext2 | c4 | ptb | +| --------- | --------- | ----- | ------ | +| GPTQ | **5.575** | 7.470 | 63.575 | +| AWQ | **5.595** | 7.444 | 35.167 | +| OmniQuant | **5.586** | 7.455 | 34.192 | + +#### Calibrate with c4 + +| | wikitext2 | c4 | ptb | +| --------- | --------- | --------- | ------- | +| GPTQ | 5.615 | **7.443** | 122.070 | +| AWQ | 5.596 | **7.436** | 33.148 | +| OmniQuant | 5.620 | 7.457 | 34.001 | + +#### Calibrate with pileval + +| | wikitext2 | c4 | ptb | +| --------- | --------- | ----- | ------ | +| GPTQ | 5.610 | 7.477 | 136.84 | +| AWQ | 5.613 | 7.438 | 33.18 | +| OmniQuant | 5.618 | 7.458 | 34.526 | + +### Setting 2: w3a16g128 llama2-7b seq_len=512 + +#### Calibrate with wikitext2 + +| | wikitext2 | c4 | ptb | +| --------- | --------- | ----- | ------- | +| GPTQ | **6.133** | 8.696 | 234.977 | +| AWQ | **6.138** | 8.272 | 38.86 | +| OmniQuant | **6.096** | 8.325 | 40.667 | + +#### Calibrate with c4 + +| | wikitext2 | c4 | ptb | +| --------- | --------- | --------- | ------- | +| GPTQ | 6.324 | **8.385** | 358.013 | +| AWQ | 6.181 | **8.249** | 39.27 | +| OmniQuant | 6.259 | **8.317** | 41.835 | + +#### Calibrate with pileval + +| | wikitext2 | c4 | ptb | +| --------- | --------- | ----- | ------- | 
+| GPTQ | 6.330 | 8.534 | 263.279 | +| AWQ | 6.217 | 8.284 | 37.117 | +| OmniQuant | 6.214 | 8.320 | 42.335 | diff --git a/ci_check/awq_w4a16_fakequant_eval.yml b/ci_check/awq_w4a16_fakequant_eval.yml index 5f3700f41..c2b0cf5ca 100644 --- a/ci_check/awq_w4a16_fakequant_eval.yml +++ b/ci_check/awq_w4a16_fakequant_eval.yml @@ -20,7 +20,6 @@ eval: path: /home/runner/work/llmc/llmc/check/datasets/eval/wikitext2 bs: 1 seq_len: 16 # 2048 - eval_token_consist: True quant: method: Awq weight: diff --git a/ci_check/change_files.py b/ci_check/change_files.py index c07db9eca..25b69cf7f 100644 --- a/ci_check/change_files.py +++ b/ci_check/change_files.py @@ -40,15 +40,7 @@ def main(): "modifications": [ ( "torch.cuda.empty_cache()", - "if use_cuda: torch.cuda.empty_cache()" - ), - ( - "init_process_group(backend='nccl')", - "init_process_group(backend='gloo')" - ), - ( - "torch.cuda.set_device(int(os.environ['LOCAL_RANK']))", - "# torch.cuda.set_device(int(os.environ['LOCAL_RANK']))" + "if use_cuda: torch.cuda.empty_cache()", ) ], } @@ -89,20 +81,6 @@ def main(): ), ], } - elif file_path == "../llmc/eval/eval_base.py": - modifications = { - "header": [ - 'device_zbl = "cpu"\n', - 'use_cuda = (device_zbl != "cpu")\n', - ], - "modifications": [ - (".cuda()", ".to(device_zbl)"), - ( - "torch.cuda.empty_cache()", - "if use_cuda: torch.cuda.empty_cache()", - ), - ], - } elif file_path == "../llmc/eval/eval_ppl.py": modifications = { "header": [ @@ -119,22 +97,6 @@ def main(): ("nlls = []", "nlls = []; nsamples = nsamples_zbl"), ], } - elif file_path == "../llmc/eval/eval_token_consist.py": - modifications = { - "header": [ - 'device_zbl = "cpu"\n', - 'use_cuda = (device_zbl != "cpu")\n', - "nsamples_zbl = 1\n", - ], - "modifications": [ - (".cuda()", ".to(device_zbl)"), - ( - "torch.cuda.empty_cache()", - "if use_cuda: torch.cuda.empty_cache()", - ), - ("for i in range(0, nsamples, bs):", "for i in range(0, 1, 1):"), - ], - } else: print(f"File {file_path} not recognized or not specified for modification.") continue diff --git a/ci_check/cpu.txt b/ci_check/cpu.txt index f95571cd9..6eadabb18 100644 --- a/ci_check/cpu.txt +++ b/ci_check/cpu.txt @@ -1,7 +1,5 @@ ../llmc/compression/quantization/base_blockwise_quantization.py ../llmc/__main__.py -../llmc/eval/eval_base.py -../llmc/eval/eval_token_consist.py ../llmc/eval/eval_ppl.py ../llmc/compression/quantization/awq.py ../llmc/models/base_model.py \ No newline at end of file diff --git a/ci_check/run.sh b/ci_check/run.sh index d5ad5dcb4..24dc9da9f 100644 --- a/ci_check/run.sh +++ b/ci_check/run.sh @@ -4,22 +4,7 @@ current_directory=$(pwd) llmc=$(echo "$current_directory" | sed 's/\/ci_check$//') export PYTHONPATH=$llmc:$PYTHONPATH -config=${llmc}/ci_check/awq_w4a16_fakequant_eval.yml - -nnodes=1 -nproc_per_node=1 -MASTER_ADDR=127.0.0.1 -MASTER_PORT=$((10000 + RANDOM % 20000)) - -RANDOM=$(python -c 'import uuid; print(uuid.uuid4())') -task_id=$RANDOM cd ../scripts -torchrun \ - --nnodes $nnodes \ - --nproc_per_node $nproc_per_node \ - --rdzv_id $task_id \ - --rdzv_backend c10d \ - --rdzv_endpoint $MASTER_ADDR:$MASTER_PORT \ - ${llmc}/llmc/__main__.py --config $config --task_id $task_id \ +python -m llmc --config ../ci_check/awq_w4a16_fakequant_eval.yml diff --git a/configs/quantization/AdaDim/adadim_w8a8_fakequant_eval.yml b/configs/quantization/AdaDim/adadim_w8a8_fakequant_eval.yml new file mode 100644 index 000000000..c88b1e8bd --- /dev/null +++ b/configs/quantization/AdaDim/adadim_w8a8_fakequant_eval.yml @@ -0,0 +1,37 @@ +base: + seed: &seed 0 +model: + 
type: Llama + path: model path + torch_dtype: auto +calib: + name: c4 + download: False + path: calib data path + n_samples: 128 + bs: 1 + seq_len: 2048 + preproc: c4_gptq + seed: *seed +eval: + eval_pos: [pretrain, fake_quant] + name: wikitext2 + download: False + path: eval data path + bs: 1 + seq_len: 2048 +quant: + method: AdaDim + weight: + bit: 8 + symmetric: True + granularity: per_channel + group_size: -1 + act: + bit: 8 + symmetric: True + granularity: per_token + quant_out: True +save: + save_trans: False + save_path: ./save diff --git a/configs/quantization/Awq/awq_w4a16.yml b/configs/quantization/Awq/awq_w4a16.yml new file mode 100644 index 000000000..617bc6fa1 --- /dev/null +++ b/configs/quantization/Awq/awq_w4a16.yml @@ -0,0 +1,33 @@ +base: + seed: &seed 42 +model: + type: Llama + path: model path + torch_dtype: auto +calib: + name: pileval + download: False + path: calib data path + n_samples: 128 + bs: -1 + seq_len: 512 + preproc: pileval_awq + seed: *seed +eval: + # eval_pos: [] + eval_pos: [pretrain, transformed] + name: wikitext2 + download: False + path: eval data path + bs: 1 + seq_len: 2048 +quant: + method: Awq + weight: + bit: 4 + symmetric: False + granularity: per_group + group_size: 128 +save: + save_trans: True + save_path: ./save diff --git a/configs/quantization/Awq/awq_w4a16_fakequant_eval.yml b/configs/quantization/Awq/awq_w4a16_fakequant_eval.yml new file mode 100644 index 000000000..83113f037 --- /dev/null +++ b/configs/quantization/Awq/awq_w4a16_fakequant_eval.yml @@ -0,0 +1,32 @@ +base: + seed: &seed 42 +model: + type: Llama + path: model path + torch_dtype: auto +calib: + name: pileval + download: False + path: calib data path + n_samples: 128 + bs: -1 + seq_len: 512 + preproc: pileval_awq + seed: *seed +eval: + eval_pos: [pretrain, transformed, fake_quant] + name: wikitext2 + download: False + path: eval data path + bs: 1 + seq_len: 2048 +quant: + method: Awq + weight: + bit: 4 + symmetric: False + granularity: per_group + group_size: 128 +save: + save_trans: False + save_path: ./save diff --git a/configs/quantization/Awq/awq_w4a16_fakequant_eval_general.yml b/configs/quantization/Awq/awq_w4a16_fakequant_eval_general.yml new file mode 100644 index 000000000..246f3596f --- /dev/null +++ b/configs/quantization/Awq/awq_w4a16_fakequant_eval_general.yml @@ -0,0 +1,37 @@ +base: + seed: &seed 42 +model: + type: Llama + path: model path + tokenizer_mode: slow + torch_dtype: auto +calib: + name: pileval + download: False + path: calib data path + n_samples: 128 + bs: -1 + seq_len: 512 + preproc: general + seed: *seed +eval: + eval_pos: [pretrain, fake_quant] + name: wikitext2 + download: False + path: eval data path + bs: 1 + inference_per_block: False + # For 70B model eval, bs can be set to 20, and inference_per_block can be set to True. + # For 7B / 13B model eval, bs can be set to 1, and inference_per_block can be set to False. 
+ seq_len: 2048 +quant: + method: Awq + weight: + bit: 4 + symmetric: False + granularity: per_group + group_size: 128 +save: + save_trans: False + save_path: ./save + tokenizer_file_substring: ["token"] diff --git a/configs/quantization/Awq/awq_w4a16_fakequant_eval_general_custom.yml b/configs/quantization/Awq/awq_w4a16_fakequant_eval_general_custom.yml new file mode 100644 index 000000000..3d5b517dd --- /dev/null +++ b/configs/quantization/Awq/awq_w4a16_fakequant_eval_general_custom.yml @@ -0,0 +1,36 @@ +base: + seed: &seed 42 +model: + type: Llama + path: model path + torch_dtype: auto +calib: + name: custom + download: False + load_from_txt: True + path: ./inputs.txt + n_samples: 128 + bs: -1 + seq_len: 512 + preproc: random_truncate_txt + seed: *seed +eval: + eval_pos: [pretrain, fake_quant] + name: wikitext2 + download: False + path: eval data path + bs: 1 + inference_per_block: False + # For 70B model eval, bs can be set to 20, and inference_per_block can be set to True. + # For 7B / 13B model eval, bs can be set to 1, and inference_per_block can be set to False. + seq_len: 2048 +quant: + method: Awq + weight: + bit: 4 + symmetric: False + granularity: per_group + group_size: 128 +save: + save_trans: False + save_path: ./save diff --git a/configs/quantization/Awq/awq_w4a16_fakequant_trt-llm.yml b/configs/quantization/Awq/awq_w4a16_fakequant_trt-llm.yml new file mode 100644 index 000000000..c793d25f1 --- /dev/null +++ b/configs/quantization/Awq/awq_w4a16_fakequant_trt-llm.yml @@ -0,0 +1,36 @@ +base: + seed: &seed 42 +model: + type: Llama + path: model path + torch_dtype: auto +calib: + name: pileval + download: False + path: calib data path + n_samples: 128 + bs: -1 + seq_len: 512 + preproc: pileval_awq + seed: *seed +eval: + eval_pos: [] + name: wikitext2 + download: False + path: eval data path + bs: 1 + seq_len: 2048 +quant: + method: Awq + weight: + bit: 4 + symmetric: True + granularity: per_group + group_size: 128 +save: + save_trans: False + save_trtllm: True + trtllm_cfg: + tp_size: 1 + pp_size: 1 + save_path: ./save diff --git a/configs/quantization/Awq/awq_w4a4_best.yml b/configs/quantization/Awq/awq_w4a4_best.yml new file mode 100644 index 000000000..ddf1850b7 --- /dev/null +++ b/configs/quantization/Awq/awq_w4a4_best.yml @@ -0,0 +1,52 @@ +base: + seed: &seed 42 +model: + type: Llama + path: model path + tokenizer_mode: slow + torch_dtype: auto +calib: + name: pileval + download: False + path: calib data path + n_samples: 128 + bs: -1 + seq_len: 512 + preproc: general + seed: *seed +eval: + eval_pos: [pretrain, transformed, fake_quant] + name: wikitext2 + download: False + path: eval data path + bs: 1 + inference_per_block: False + # For 70B model eval, bs can be set to 20, and inference_per_block can be set to True. + # For 7B / 13B model eval, bs can be set to 1, and inference_per_block can be set to False. 
+ seq_len: 2048 +quant: + method: Awq + weight: + bit: 4 + symmetric: False + granularity: per_channel + group_size: -1 + calib_algo: learnable + act: + bit: 4 + symmetric: False + granularity: per_token + calib_algo: minmax + special: + trans: True + trans_version: v2 + weight_clip: True + clip_version: v2 + save_scale: True + scale_path: scale path + save_clip: True + clip_path: clip path +save: + save_trans: False + save_quant: False + save_path: ./save diff --git a/configs/quantization/Awq/awq_w4a8_best.yml b/configs/quantization/Awq/awq_w4a8_best.yml new file mode 100644 index 000000000..bc9761c7c --- /dev/null +++ b/configs/quantization/Awq/awq_w4a8_best.yml @@ -0,0 +1,52 @@ +base: + seed: &seed 42 +model: + type: Llama + path: model path + tokenizer_mode: slow + torch_dtype: auto +calib: + name: pileval + download: False + path: calib data path + n_samples: 128 + bs: -1 + seq_len: 512 + preproc: general + seed: *seed +eval: + eval_pos: [pretrain, transformed, fake_quant] + name: wikitext2 + download: False + path: eval data path + bs: 1 + inference_per_block: False + # For 70B model eval, bs can be set to 20, and inference_per_block can be set to True. + # For 7B / 13B model eval, bs can be set to 1, and inference_per_block can be set to False. + seq_len: 2048 +quant: + method: Awq + weight: + bit: 4 + symmetric: False + granularity: per_channel + group_size: -1 + calib_algo: learnable + act: + bit: 8 + symmetric: False + granularity: per_token + calib_algo: minmax + special: + trans: True + trans_version: v2 + weight_clip: True + clip_version: v2 + save_scale: True + scale_path: scale path + save_clip: True + clip_path: clip path +save: + save_trans: False + save_quant: False + save_path: ./save diff --git a/configs/quantization/Awq/awq_w8a8_fakequant_eval_general.yml b/configs/quantization/Awq/awq_w8a8_fakequant_eval_general.yml new file mode 100644 index 000000000..c5e449208 --- /dev/null +++ b/configs/quantization/Awq/awq_w8a8_fakequant_eval_general.yml @@ -0,0 +1,35 @@ +base: + seed: &seed 42 +model: + type: Llama + path: model path + torch_dtype: auto +calib: + name: pileval + download: False + path: calib data path + n_samples: 128 + bs: -1 + seq_len: 512 + preproc: pileval_awq + seed: *seed +eval: + eval_pos: [pretrain, fake_quant] + name: wikitext2 + download: False + path: eval data path + bs: 1 + seq_len: 2048 +quant: + method: Awq + weight: + bit: 8 + symmetric: True + granularity: per_channel + act: + bit: 8 + symmetric: True + granularity: per_token +save: + save_trans: False + save_path: ./save diff --git a/configs/quantization/Awq/awq_w_only_mix_bits_1.yml b/configs/quantization/Awq/awq_w_only_mix_bits_1.yml new file mode 100644 index 000000000..da018ee82 --- /dev/null +++ b/configs/quantization/Awq/awq_w_only_mix_bits_1.yml @@ -0,0 +1,46 @@ +base: + seed: &seed 42 +model: + type: Llama + path: model path + tokenizer_mode: slow + torch_dtype: auto +calib: + name: pileval + download: False + path: calib data path + n_samples: 128 + bs: -1 + seq_len: 512 + preproc: general + seed: *seed +eval: + eval_pos: [pretrain, fake_quant] + name: wikitext2 + download: False + path: eval data path + bs: 1 + inference_per_block: False + # For 70B model eval, bs can be set to 20, and inference_per_block can be set to True. + # For 7B / 13B model eval, bs can be set to 1, and inference_per_block can be set to False. 
+ seq_len: 2048 +quant: + method: Awq + weight: + bit: 4 + symmetric: False + granularity: per_group + group_size: 128 + mix_bits: + setting_0: + layer_name: [down_proj] + do_quant: True + weight: + bit: 8 + symmetric: False + granularity: per_group + group_size: 128 +save: + save_trans: False + save_path: ./save + tokenizer_file_substring: ["token"] diff --git a/configs/quantization/Awq/awq_w_only_mix_bits_2.yml b/configs/quantization/Awq/awq_w_only_mix_bits_2.yml new file mode 100644 index 000000000..237ea6410 --- /dev/null +++ b/configs/quantization/Awq/awq_w_only_mix_bits_2.yml @@ -0,0 +1,49 @@ +base: + seed: &seed 42 +model: + type: Llama + path: model path + tokenizer_mode: slow + torch_dtype: auto +calib: + name: pileval + download: False + path: calib data path + n_samples: 128 + bs: -1 + seq_len: 512 + preproc: general + seed: *seed +eval: + eval_pos: [pretrain, fake_quant] + name: wikitext2 + download: False + path: eval data path + bs: 1 + inference_per_block: False + # For 70B model eval, bs can be set to 20, and inference_per_block can be set to True. + # For 7B / 13B model eval, bs can be set to 1, and inference_per_block can be set to False. + seq_len: 2048 +quant: + method: Awq + weight: + bit: 4 + symmetric: False + granularity: per_group + group_size: 128 + mix_bits: + setting_0: + layer_name: [down_proj#0-1-2-3-28-29-30-31] + do_quant: True + weight: + bit: 8 + symmetric: False + granularity: per_group + group_size: 128 + setting_1: + layer_name: [o_proj] + do_quant: False +save: + save_trans: False + save_path: ./save + tokenizer_file_substring: ["token"] diff --git a/configs/quantization/Awq/awq_wa_mix_bits.yml b/configs/quantization/Awq/awq_wa_mix_bits.yml new file mode 100644 index 000000000..0b0fd120e --- /dev/null +++ b/configs/quantization/Awq/awq_wa_mix_bits.yml @@ -0,0 +1,47 @@ +base: + seed: &seed 42 +model: + type: Llama + path: model path + torch_dtype: auto +calib: + name: pileval + download: False + path: calib data path + n_samples: 128 + bs: -1 + seq_len: 512 + preproc: pileval_awq + seed: *seed +eval: + eval_pos: [pretrain, fake_quant] + name: wikitext2 + download: False + path: eval data path + bs: 1 + seq_len: 2048 +quant: + method: Awq + weight: + bit: 4 + symmetric: False + granularity: per_channel + act: + bit: 4 + symmetric: False + granularity: per_token + mix_bits: + setting_0: + layer_name: [down_proj] + do_quant: True + weight: + bit: 8 + symmetric: False + granularity: per_channel + act: + bit: 8 + symmetric: False + granularity: per_token +save: + save_trans: False + save_path: ./save diff --git a/configs/quantization/DGQ/dgq_w4a8_fakequant_eval.yml b/configs/quantization/DGQ/dgq_w4a8_fakequant_eval.yml new file mode 100644 index 000000000..c0bd8af33 --- /dev/null +++ b/configs/quantization/DGQ/dgq_w4a8_fakequant_eval.yml @@ -0,0 +1,41 @@ +base: + seed: &seed 0 +model: + type: Llama + path: model path + torch_dtype: auto +calib: + name: wikitext2 + download: False + path: calib data path + n_samples: 1 + bs: 1 + seq_len: 2048 + preproc: wikitext2_gptq + seed: *seed +eval: + eval_pos: [] + name: wikitext2 + download: False + path: eval data path + bs: 1 + seq_len: 2048 +quant: + method: DGQ + weight: + w_1: + bit: 4 + symmetric: False + granularity: per_group + group_size: 128 + w_2: + bit: 8 + symmetric: True + granularity: per_channel + act: + bit: 8 + symmetric: True + granularity: per_token +save: + save_trans: False + save_path: ./save diff --git a/configs/quantization/FP/awq_we2m1a16_128_fakequant_eval.yml 
b/configs/quantization/FP/awq_we2m1a16_128_fakequant_eval.yml new file mode 100644 index 000000000..6ee067efa --- /dev/null +++ b/configs/quantization/FP/awq_we2m1a16_128_fakequant_eval.yml @@ -0,0 +1,33 @@ +base: + seed: &seed 42 +model: + type: Llama + path: model path + tokenizer_mode: slow + torch_dtype: auto +calib: + name: pileval + download: False + path: calib data path + n_samples: 128 + bs: -1 + seq_len: 512 + preproc: pileval_awq + seed: *seed +eval: + eval_pos: [pretrain, fake_quant] + name: wikitext2 + download: False + path: eval data path + bs: 1 + seq_len: 2048 +quant: + method: Awq + weight: + bit: 4 + symmetric: True + granularity: per_group + group_size: 128 +save: + save_trans: False + save_path: ./save diff --git a/configs/quantization/FP/rtn_w4a16_fakequant_eval.yml b/configs/quantization/FP/rtn_w4a16_fakequant_eval.yml new file mode 100644 index 000000000..fc8663f3a --- /dev/null +++ b/configs/quantization/FP/rtn_w4a16_fakequant_eval.yml @@ -0,0 +1,24 @@ +base: + seed: &seed 42 +model: + type: Llama + path: model path + tokenizer_mode: slow + torch_dtype: auto +eval: + eval_pos: [pretrain, fake_quant] + name: wikitext2 + download: False + path: eval data path + bs: 1 + seq_len: 2048 +quant: + method: RTN + weight: + bit: 4 + symmetric: True + granularity: per_group + group_size: 128 +save: + save_trans: False + save_path: ./save diff --git a/configs/quantization/FP/rtn_w8a8_fakequant_eval.yml b/configs/quantization/FP/rtn_w8a8_fakequant_eval.yml new file mode 100644 index 000000000..cff96da07 --- /dev/null +++ b/configs/quantization/FP/rtn_w8a8_fakequant_eval.yml @@ -0,0 +1,27 @@ +base: + seed: &seed 42 +model: + type: Llama + path: model path + tokenizer_mode: slow + torch_dtype: auto +eval: + eval_pos: [pretrain, fake_quant] + name: wikitext2 + download: False + path: eval data path + bs: 1 + seq_len: 2048 +quant: + method: RTN + weight: + bit: 8 + symmetric: True + granularity: per_channel + act: + bit: 8 + symmetric: True + granularity: per_token +save: + save_trans: False + save_path: ./save diff --git a/configs/quantization/FP/rtn_we2m1a16_fakequant_eval.yml b/configs/quantization/FP/rtn_we2m1a16_fakequant_eval.yml new file mode 100644 index 000000000..fcf0451e0 --- /dev/null +++ b/configs/quantization/FP/rtn_we2m1a16_fakequant_eval.yml @@ -0,0 +1,23 @@ +base: + seed: &seed 42 +model: + type: Llama + path: model path + tokenizer_mode: slow + torch_dtype: auto +eval: + eval_pos: [pretrain, fake_quant] + name: wikitext2 + download: False + path: eval data path + bs: 1 + seq_len: 2048 +quant: + method: RTN + weight: + bit: e2m1 + symmetric: True + granularity: per_channel +save: + save_trans: False + save_path: ./save diff --git a/configs/quantization/FP/rtn_we2m1a16_fakequant_g128_eval.yml b/configs/quantization/FP/rtn_we2m1a16_fakequant_g128_eval.yml new file mode 100644 index 000000000..493a124f2 --- /dev/null +++ b/configs/quantization/FP/rtn_we2m1a16_fakequant_g128_eval.yml @@ -0,0 +1,24 @@ +base: + seed: &seed 42 +model: + type: Llama + path: model path + tokenizer_mode: slow + torch_dtype: auto +eval: + eval_pos: [pretrain, fake_quant] + name: wikitext2 + download: False + path: eval data path + bs: 1 + seq_len: 2048 +quant: + method: RTN + weight: + bit: e2m1 + symmetric: True + granularity: per_group + group_size: 128 +save: + save_trans: False + save_path: ./save diff --git a/configs/quantization/FP/rtn_we2m1ae2m1_fakequant_eval.yml b/configs/quantization/FP/rtn_we2m1ae2m1_fakequant_eval.yml new file mode 100644 index 000000000..2938cd11a --- /dev/null 
+++ b/configs/quantization/FP/rtn_we2m1ae2m1_fakequant_eval.yml @@ -0,0 +1,27 @@ +base: + seed: &seed 42 +model: + type: Llama + path: model path + tokenizer_mode: slow + torch_dtype: auto +eval: + eval_pos: [pretrain, fake_quant] + name: wikitext2 + download: False + path: eval data path + bs: 1 + seq_len: 2048 +quant: + method: RTN + weight: + bit: e2m1 + symmetric: True + granularity: per_channel + act: + bit: e2m1 + symmetric: True + granularity: per_token +save: + save_trans: False + save_path: ./save diff --git a/configs/quantization/FP/rtn_we4m3ae4m3_fakequant_eval.yml b/configs/quantization/FP/rtn_we4m3ae4m3_fakequant_eval.yml new file mode 100644 index 000000000..ad8e1b935 --- /dev/null +++ b/configs/quantization/FP/rtn_we4m3ae4m3_fakequant_eval.yml @@ -0,0 +1,27 @@ +base: + seed: &seed 42 +model: + type: Llama + path: model path + tokenizer_mode: slow + torch_dtype: auto +eval: + eval_pos: [pretrain, fake_quant] + name: wikitext2 + download: False + path: eval data path + bs: 1 + seq_len: 2048 +quant: + method: RTN + weight: + bit: e4m3 + symmetric: True + granularity: per_channel + act: + bit: e4m3 + symmetric: True + granularity: per_token +save: + save_trans: False + save_path: ./save diff --git a/configs/quantization/FP/rtn_we5m2ae5m2_fakequant_eval.yml b/configs/quantization/FP/rtn_we5m2ae5m2_fakequant_eval.yml new file mode 100644 index 000000000..8e8f3cae0 --- /dev/null +++ b/configs/quantization/FP/rtn_we5m2ae5m2_fakequant_eval.yml @@ -0,0 +1,27 @@ +base: + seed: &seed 42 +model: + type: Llama + path: model path + tokenizer_mode: slow + torch_dtype: auto +eval: + eval_pos: [pretrain, fake_quant] + name: wikitext2 + download: False + path: eval data path + bs: 1 + seq_len: 2048 +quant: + method: RTN + weight: + bit: e5m2 + symmetric: True + granularity: per_channel + act: + bit: e5m2 + symmetric: True + granularity: per_token +save: + save_trans: False + save_path: ./save diff --git a/configs/quantization/GPTQ/gptq_owq_w4a16_fakequant_eval.yml b/configs/quantization/GPTQ/gptq_owq_w4a16_fakequant_eval.yml new file mode 100644 index 000000000..3abfb0b98 --- /dev/null +++ b/configs/quantization/GPTQ/gptq_owq_w4a16_fakequant_eval.yml @@ -0,0 +1,41 @@ +base: + seed: &seed 0 +model: + type: Llama + path: model path + torch_dtype: auto +calib: + name: c4 + download: False + n_samples: 128 + path: calib data path + bs: 1 + seq_len: 2048 + preproc: c4_gptq + seed: *seed +eval: + eval_pos: [fake_quant] + name: wikitext2 + download: False + path: eval data path + bs: 1 + seq_len: 2048 +quant: + method: GPTQ + weight: + bit: 4 + symmetric: False + granularity: per_group + group_size: 128 + special: + actorder: False + static_groups: False + percdamp: 0.01 + blocksize: 128 + true_sequential: True + owq: True + n_outs: [6, 6, 6, 6, 2, 2, 6] #target bit is 4.01 + quant_out: True +save: + save_trans: False + save_path: ./save diff --git a/configs/quantization/GPTQ/gptq_quarot.yml b/configs/quantization/GPTQ/gptq_quarot.yml new file mode 100644 index 000000000..c753eea4a --- /dev/null +++ b/configs/quantization/GPTQ/gptq_quarot.yml @@ -0,0 +1,51 @@ +base: + seed: &seed 42 +model: + type: Llama + path: model path + torch_dtype: auto +calib: + name: pileval + download: False + n_samples: 128 + path: valib data path + bs: 1 + seq_len: 512 + preproc: general + seed: *seed +eval: + eval_pos: [fake_quant] + name: [wikitext2, c4] + download: False + path: eval data path + bs: 1 + inference_per_block: False + seq_len: 2048 +quant: + method: GPTQ + weight: + bit: 6 + symmetric: False + granularity: 
per_channel + group_size: -1 + qmax_to_tensor: True + calib_algo: minmax + act: + bit: 6 + symmetric: False + granularity: per_token + qmax_to_tensor: True + calib_algo: minmax + special: + actorder: True + static_groups: True + percdamp: 0.01 + blocksize: 128 + true_sequential: True + online_rotate: True + fp32_had: True + quant_out: True +save: + save_trans: False + save_fake: False + save_path: ./save diff --git a/configs/quantization/GPTQ/gptq_w4a16_fakequant_eval.yml b/configs/quantization/GPTQ/gptq_w4a16_fakequant_eval.yml new file mode 100644 index 000000000..dc5eadef8 --- /dev/null +++ b/configs/quantization/GPTQ/gptq_w4a16_fakequant_eval.yml @@ -0,0 +1,39 @@ +base: + seed: &seed 0 +model: + type: Llama + path: model path + torch_dtype: auto +calib: + name: c4 + download: False + n_samples: 128 + path: calib data path + bs: 1 + seq_len: 2048 + preproc: c4_gptq + seed: *seed +eval: + eval_pos: [pretrain, fake_quant] + name: wikitext2 + download: False + path: eval data path + bs: 1 + seq_len: 2048 +quant: + method: GPTQ + weight: + bit: 4 + symmetric: False + granularity: per_group + group_size: 128 + special: + actorder: True + static_groups: False + percdamp: 0.01 + blocksize: 128 + true_sequential: True + quant_out: True +save: + save_trans: False + save_path: ./save diff --git a/configs/quantization/GPTQ/gptq_w4a16_fakequant_eval_general.yml b/configs/quantization/GPTQ/gptq_w4a16_fakequant_eval_general.yml new file mode 100644 index 000000000..ea4abee7f --- /dev/null +++ b/configs/quantization/GPTQ/gptq_w4a16_fakequant_eval_general.yml @@ -0,0 +1,39 @@ +base: + seed: &seed 42 +model: + type: Llama + path: model path + torch_dtype: auto +calib: + name: pileval + download: False + path: calib data path + n_samples: 128 + bs: 1 + seq_len: 512 + preproc: general + seed: *seed +eval: + eval_pos: [pretrain, fake_quant] + name: wikitext2 + download: False + path: eval data path + bs: 1 + seq_len: 2048 +quant: + method: GPTQ + weight: + bit: 4 + symmetric: False + granularity: per_group + group_size: 128 + special: + actorder: True + static_groups: True + percdamp: 0.01 + blocksize: 128 + true_sequential: True + quant_out: True +save: + save_trans: False + save_path: ./save diff --git a/configs/quantization/HQQ/hqq_w4a16_fakequant_eval.yml b/configs/quantization/HQQ/hqq_w4a16_fakequant_eval.yml new file mode 100644 index 000000000..5083eb983 --- /dev/null +++ b/configs/quantization/HQQ/hqq_w4a16_fakequant_eval.yml @@ -0,0 +1,30 @@ +base: + seed: &seed 42 +model: + type: Llama + path: model path + torch_dtype: auto +eval: + eval_pos: [pretrain, fake_quant] + name: wikitext2 + download: False + path: eval data path + bs: 1 + seq_len: 2048 +quant: + method: HQQ + weight: + bit: 4 + symmetric: False + granularity: per_group + group_size: 128 + round_zp: False + special: + axis : 0 + lp_norm : 0.7 + beta : 10 + kappa : 1.01 + iters : 20 +save: + save_trans: False + save_path: ./save diff --git a/configs/quantization/LlmInt8/llmint8_w8a8_fakequant_eval.yml b/configs/quantization/LlmInt8/llmint8_w8a8_fakequant_eval.yml new file mode 100644 index 000000000..d2a5d66d8 --- /dev/null +++ b/configs/quantization/LlmInt8/llmint8_w8a8_fakequant_eval.yml @@ -0,0 +1,38 @@ +base: + seed: &seed 42 +model: + type: Llama + path: model path + torch_dtype: auto +calib: + name: pileval + download: False + path: calib data path + n_samples: 128 + bs: 1 + seq_len: 2048 + preproc: general + seed: *seed +eval: + eval_pos: [pretrain, fake_quant] + name: wikitext2 + download: False + path: eval data path + bs: 1 + 
seq_len: 2048 +quant: + method: LlmInt8 + weight: + bit: 8 + symmetric: True + granularity: per_channel + group_size: -1 + act: + bit: 8 + symmetric: True + granularity: per_token + special: + threshold: 6.0 +save: + save_trans: False + save_path: ./save diff --git a/configs/quantization/NormTweaking/ntweak_llama_w4a16_fakequant_eval.yml b/configs/quantization/NormTweaking/ntweak_llama_w4a16_fakequant_eval.yml new file mode 100644 index 000000000..e3a211505 --- /dev/null +++ b/configs/quantization/NormTweaking/ntweak_llama_w4a16_fakequant_eval.yml @@ -0,0 +1,38 @@ +base: + seed: &seed 42 +model: + type: Llama + path: model path + torch_dtype: auto +calib: + name: pileval + download: False + path: calib data path + n_samples: 128 + bs: 1 + seq_len: 512 + preproc: general + seed: *seed +eval: + eval_pos: [pretrain, fake_quant] + name: wikitext2 + download: False + path: eval data path + bs: 1 + seq_len: 2048 +quant: + method: NormTweaking + weight: + bit: 4 + symmetric: False + granularity: per_group + group_size: 128 + special: + ntweak_lr: 0.000001 + deactive_amp: False + epochs: 50 + gamma: 0.001 + quant_out: True +save: + save_trans: False + save_path: ./save diff --git a/configs/quantization/NormTweaking/ntweak_llama_w8a8_fakequant_eval.yml b/configs/quantization/NormTweaking/ntweak_llama_w8a8_fakequant_eval.yml new file mode 100644 index 000000000..9624d52ce --- /dev/null +++ b/configs/quantization/NormTweaking/ntweak_llama_w8a8_fakequant_eval.yml @@ -0,0 +1,42 @@ +base: + seed: &seed 42 +model: + type: Llama + path: model path + torch_dtype: auto +calib: + name: pileval + download: False + path: calib data path + n_samples: 128 + bs: 1 + seq_len: 512 + preproc: general + seed: *seed +eval: + eval_pos: [pretrain, fake_quant] + name: wikitext2 + download: False + path: eval data path + bs: 1 + seq_len: 2048 +quant: + method: NormTweaking + weight: + bit: 8 + symmetric: True + granularity: per_channel + group_size: -1 + act: + bit: 8 + symmetric: True + granularity: per_token + special: + ntweak_lr: 0.000001 + deactive_amp: True + epochs: 50 + gamma: 0.001 + quant_out: True +save: + save_trans: False + save_path: ./save diff --git a/configs/quantization/OmniQuant/omniq_llama_w2a16_best.yml b/configs/quantization/OmniQuant/omniq_llama_w2a16_best.yml new file mode 100644 index 000000000..511aaf734 --- /dev/null +++ b/configs/quantization/OmniQuant/omniq_llama_w2a16_best.yml @@ -0,0 +1,51 @@ +base: + seed: &seed 42 +model: + type: Llama + path: transformed model path + torch_dtype: auto +calib: + name: pileval + download: False + path: calib data path + n_samples: 128 + bs: 1 + seq_len: 2048 + preproc: general + seed: *seed +eval: + eval_pos: [fake_quant] + name: [wikitext2, c4, ptb] + download: False + path: eval data path + bs: 1 + inference_per_block: False + # For 70B model eval, bs can be set to 20, and inference_per_block can be set to True. + # For 7B / 13B model eval, bs can be set to 1, and inference_per_block can be set to False. 
+ seq_len: 2048 +quant: + method: OmniQuant + weight: + bit: 2 + symmetric: False + granularity: per_group + group_size: 64 + calib_algo: learnable + ste: True + special: + aug_loss: True + lwc: True + let: False + lwc_lr: 0.01 + let_lr: 0.005 + use_shift: False + alpha: 0.5 + deactive_amp: True + epochs: 5 + wd: 0 + search_clip_init: True + search_scale_init: True + quant_out: True +save: + save_trans: False + save_path: ./save diff --git a/configs/quantization/OmniQuant/omniq_llama_w2a16_fakequant_eval.yml b/configs/quantization/OmniQuant/omniq_llama_w2a16_fakequant_eval.yml new file mode 100644 index 000000000..9840f97df --- /dev/null +++ b/configs/quantization/OmniQuant/omniq_llama_w2a16_fakequant_eval.yml @@ -0,0 +1,49 @@ +base: + seed: &seed 42 +model: + type: Llama + path: model path + torch_dtype: auto +calib: + name: pileval + download: False + path: calib data path + n_samples: 128 + bs: 1 + seq_len: 512 + preproc: general + seed: *seed +eval: + eval_pos: [fake_quant] + name: [wikitext2, c4, ptb] + download: False + path: eval data path + bs: 1 + inference_per_block: False + # For 70B model eval, bs can be set to 20, and inference_per_block can be set to True. + # For 7B / 13B model eval, bs can be set to 1, and inference_per_block can be set to False. + seq_len: 2048 +quant: + method: OmniQuant + weight: + bit: 2 + symmetric: False + granularity: per_group + group_size: 64 + calib_algo: learnable + ste: True + special: + aug_loss: True + lwc: True + let: False + lwc_lr: 0.01 + let_lr: 0.005 + use_shift: False + alpha: 0.5 + deactive_amp: True + epochs: 40 + wd: 0 + quant_out: True +save: + save_trans: False + save_path: ./save diff --git a/configs/quantization/OmniQuant/omniq_llama_w4a16_fakequant_eval.yml b/configs/quantization/OmniQuant/omniq_llama_w4a16_fakequant_eval.yml new file mode 100644 index 000000000..bbd03e56f --- /dev/null +++ b/configs/quantization/OmniQuant/omniq_llama_w4a16_fakequant_eval.yml @@ -0,0 +1,49 @@ +base: + seed: &seed 42 +model: + type: Llama + path: model path + torch_dtype: auto +calib: + name: pileval + download: False + path: calib data path + n_samples: 128 + bs: 1 + seq_len: 512 + preproc: general + seed: *seed +eval: + eval_pos: [fake_quant] + name: [wikitext2, c4, ptb] + download: False + path: eval data path + bs: 1 + inference_per_block: False + # For 70B model eval, bs can be set to 20, and inference_per_block can be set to True. + # For 7B / 13B model eval, bs can be set to 1, and inference_per_block can be set to False. 
+ seq_len: 2048 +quant: + method: OmniQuant + weight: + bit: 4 + symmetric: False + granularity: per_group + group_size: 128 + calib_algo: learnable + ste: True + special: + aug_loss: False + lwc: True + let: False + lwc_lr: 0.01 + let_lr: 0.005 + use_shift: False + alpha: 0.5 + deactive_amp: True + epochs: 20 + wd: 0 + quant_out: True +save: + save_trans: False + save_path: ./save diff --git a/configs/quantization/OmniQuant/omniq_llama_w4a4_best.yml b/configs/quantization/OmniQuant/omniq_llama_w4a4_best.yml new file mode 100644 index 000000000..9a8bae2f2 --- /dev/null +++ b/configs/quantization/OmniQuant/omniq_llama_w4a4_best.yml @@ -0,0 +1,59 @@ +base: + seed: &seed 42 +model: + type: Llama + path: transformed model path + torch_dtype: auto +calib: + name: pileval + download: False + path: calib data path + n_samples: 128 + bs: 1 + seq_len: 2048 + preproc: general + seed: *seed +eval: + eval_pos: [fake_quant] + name: [wikitext2, c4, ptb] + download: False + path: eval data path + bs: 1 + inference_per_block: False + # For 70B model eval, bs can be set to 20, and inference_per_block can be set to True. + # For 7B / 13B model eval, bs can be set to 1, and inference_per_block can be set to False. + seq_len: 2048 +quant: + method: OmniQuant + weight: + bit: 4 + symmetric: False + granularity: per_channel + calib_algo: learnable + ste: True + act: + bit: 4 + symmetric: False + granularity: per_token + ste: True + special: + aug_loss: False + lwc: True + let: True + lwc_lr: 0.001 + let_lr: 0.001 + use_shift: False + alpha: 0.5 + deactive_amp: True + epochs: 5 + wd: 0 + search_clip_init: True + load_clip: True + search_scale_init: True + scale_path: scale path + clip_path: clip path + quant_out: True +save: + save_trans: False + save_fake: False + save_path: ./save diff --git a/configs/quantization/OmniQuant/omniq_llama_w4a8_best.yml b/configs/quantization/OmniQuant/omniq_llama_w4a8_best.yml new file mode 100644 index 000000000..267e509cb --- /dev/null +++ b/configs/quantization/OmniQuant/omniq_llama_w4a8_best.yml @@ -0,0 +1,59 @@ +base: + seed: &seed 42 +model: + type: Llama + path: model path + torch_dtype: auto +calib: + name: pileval + download: False + path: calib data path + n_samples: 128 + bs: 1 + seq_len: 2048 + preproc: general + seed: *seed +eval: + eval_pos: [fake_quant] + name: [wikitext2, c4, ptb] + download: False + path: eval data path + bs: 1 + inference_per_block: False + # For 70B model eval, bs can be set to 20, and inference_per_block can be set to True. + # For 7B / 13B model eval, bs can be set to 1, and inference_per_block can be set to False. 
+ seq_len: 2048 +quant: + method: OmniQuant + weight: + bit: 4 + symmetric: False + granularity: per_channel + calib_algo: learnable + ste: True + act: + bit: 8 + symmetric: False + granularity: per_token + ste: True + special: + aug_loss: False + lwc: True + let: True + lwc_lr: 0.001 + let_lr: 0.001 + use_shift: False + alpha: 0.5 + deactive_amp: True + epochs: 5 + wd: 0 + search_clip_init: True + load_clip: True + search_scale_init: True + scale_path: scale path + clip_path: clip path + quant_out: True +save: + save_trans: False + save_fake: False + save_path: ./save diff --git a/configs/quantization/OmniQuant/omniq_llama_w8a8_fakequant_eval.yml b/configs/quantization/OmniQuant/omniq_llama_w8a8_fakequant_eval.yml new file mode 100644 index 000000000..5912c1c6d --- /dev/null +++ b/configs/quantization/OmniQuant/omniq_llama_w8a8_fakequant_eval.yml @@ -0,0 +1,51 @@ +base: + seed: &seed 42 +model: + type: Llama + path: model path + torch_dtype: auto +calib: + name: pileval + download: False + path: calib data path + n_samples: 128 + bs: 1 + seq_len: 2048 + preproc: general + seed: *seed +eval: + eval_pos: [pretrain, fake_quant] + name: wikitext2 + download: False + path: eval data path + bs: 1 + seq_len: 2048 +quant: + method: OmniQuant + weight: + bit: 8 + symmetric: True + granularity: per_channel + group_size: -1 + calib_algo: learnable + ste: True + act: + bit: 8 + symmetric: True + granularity: per_token + ste: True + special: + aug_loss: False + let: True + lwc: True + lwc_lr: 0.01 + let_lr: 0.005 + use_shift: False + alpha: 0.5 + deactive_amp: True + epochs: 20 + wd: 0 + quant_out: True +save: + save_trans: False + save_path: ./save diff --git a/configs/quantization/OmniQuant/omniq_mistral_w8a8_fakequant_eval.yml b/configs/quantization/OmniQuant/omniq_mistral_w8a8_fakequant_eval.yml new file mode 100644 index 000000000..64a35e770 --- /dev/null +++ b/configs/quantization/OmniQuant/omniq_mistral_w8a8_fakequant_eval.yml @@ -0,0 +1,49 @@ +base: + seed: &seed 42 +model: + type: Mistral + path: models/mistral/ + torch_dtype: auto +calib: + name: pileval + download: False + path: llmc/cali_data/pileval/ + n_samples: 128 + bs: 1 + seq_len: 2048 + preproc: general + seed: *seed +eval: + eval_pos: [fake_quant] + name: [wikitext2, c4, ptb] + download: False + path: llmc/eval_data/ + bs: 1 + seq_len: 2048 +quant: + method: OmniQuant + weight: + bit: 8 + symmetric: True + granularity: per_channel + group_size: -1 + calib_algo: learnable + ste: True + act: + bit: 8 + symmetric: True + granularity: per_token + ste: True + special: + let: True + lwc_lr: 0.01 + let_lr: 0.005 + use_shift: False + alpha: 0.5 + deactive_amp: True + epochs: 20 + wd: 0 + quant_out: True +save: + save_trans: False + save_path: ./save diff --git a/configs/quantization/OmniQuant/omniq_opt_w8a8_fakequant_eval.yml b/configs/quantization/OmniQuant/omniq_opt_w8a8_fakequant_eval.yml new file mode 100644 index 000000000..e88cfea78 --- /dev/null +++ b/configs/quantization/OmniQuant/omniq_opt_w8a8_fakequant_eval.yml @@ -0,0 +1,49 @@ +base: + seed: &seed 42 +model: + type: Opt + path: model path + torch_dtype: auto +calib: + name: pileval + download: False + path: calib data path + n_samples: 128 + bs: 1 + seq_len: 2048 + preproc: general + seed: *seed +eval: + eval_pos: [pretrain, fake_quant] + name: wikitext2 + download: False + path: eval data path + bs: 1 + seq_len: 2048 +quant: + method: OmniQuant + weight: + bit: 8 + symmetric: True + granularity: per_channel + group_size: -1 + calib_algo: learnable + ste: True + act: + bit: 8 + 
symmetric: True + granularity: per_token + ste: True + special: + let: True + lwc_lr: 0.01 + let_lr: 0.005 + use_shift: True + alpha: 0.5 + deactive_amp: True + epochs: 20 + wd: 0 + quant_out: True +save: + save_trans: False + save_path: ./save diff --git a/configs/quantization/OsPlus/osplus_llama_w4a4_best.yml b/configs/quantization/OsPlus/osplus_llama_w4a4_best.yml new file mode 100644 index 000000000..12d4b66e3 --- /dev/null +++ b/configs/quantization/OsPlus/osplus_llama_w4a4_best.yml @@ -0,0 +1,46 @@ +base: + seed: &seed 42 +model: + type: Llama + path: model path + torch_dtype: auto +calib: + name: pileval + download: False + path: calib data path + n_samples: 1 + bs: 1 + seq_len: 512 + preproc: general + seed: *seed +eval: + eval_pos: [pretrain, transformed, fake_quant] + name: wikitext2 + download: False + path: eval data path + bs: 1 + seq_len: 2048 +quant: + method: OsPlus + weight: + bit: 4 + symmetric: False + granularity: per_channel + group_size: -1 + calib_algo: learnable + act: + bit: 4 + symmetric: False + granularity: per_token + special: + use_shift: False + weight_clip: True + save_scale: True + scale_path: scale path + save_clip: True + clip_path: clip path + clip_version: v2 + quant_out: True +save: + save_trans: False + save_path: ./save diff --git a/configs/quantization/OsPlus/osplus_llama_w4a8_best.yml b/configs/quantization/OsPlus/osplus_llama_w4a8_best.yml new file mode 100644 index 000000000..b5c3a6723 --- /dev/null +++ b/configs/quantization/OsPlus/osplus_llama_w4a8_best.yml @@ -0,0 +1,46 @@ +base: + seed: &seed 42 +model: + type: Llama + path: model path + torch_dtype: auto +calib: + name: pileval + download: False + path: calib data path + n_samples: 1 + bs: 1 + seq_len: 512 + preproc: general + seed: *seed +eval: + eval_pos: [pretrain, transformed, fake_quant] + name: wikitext2 + download: False + path: eval data path + bs: 1 + seq_len: 2048 +quant: + method: OsPlus + weight: + bit: 4 + symmetric: False + granularity: per_channel + group_size: -1 + calib_algo: learnable + act: + bit: 8 + symmetric: False + granularity: per_token + special: + use_shift: False + weight_clip: True + save_scale: True + scale_path: scale path + save_clip: True + clip_path: clip path + clip_version: v2 + quant_out: True +save: + save_trans: False + save_path: ./save diff --git a/configs/quantization/OsPlus/osplus_llama_w8a8_fakequant_eval_general.yml b/configs/quantization/OsPlus/osplus_llama_w8a8_fakequant_eval_general.yml new file mode 100644 index 000000000..4cd1dda47 --- /dev/null +++ b/configs/quantization/OsPlus/osplus_llama_w8a8_fakequant_eval_general.yml @@ -0,0 +1,36 @@ +base: + seed: &seed 42 +model: + type: Llama + path: model path + torch_dtype: auto +calib: + name: pileval + download: False + path: calib data path + n_samples: 1 + bs: 1 + seq_len: 512 + preproc: general + seed: *seed +eval: + eval_pos: [pretrain, transformed, fake_quant] + name: wikitext2 + download: False + path: eval data path + bs: 1 + seq_len: 2048 +quant: + method: OsPlus + weight: + bit: 8 + symmetric: True + granularity: per_channel + act: + bit: 8 + symmetric: True + granularity: per_token + quant_out: True +save: + save_trans: False + save_path: ./save diff --git a/configs/quantization/OsPlus/osplus_opt_w8a8_fakequant_eval_general.yml b/configs/quantization/OsPlus/osplus_opt_w8a8_fakequant_eval_general.yml new file mode 100644 index 000000000..7514785a7 --- /dev/null +++ b/configs/quantization/OsPlus/osplus_opt_w8a8_fakequant_eval_general.yml @@ -0,0 +1,36 @@ +base: + seed: &seed 42 +model: + 
type: Opt + path: model path + torch_dtype: auto +calib: + name: pileval + download: False + path: calib data path + n_samples: 1 + bs: 1 + seq_len: 512 + preproc: general + seed: *seed +eval: + eval_pos: [pretrain, transformed, fake_quant] + name: wikitext2 + download: False + path: eval data path + bs: 1 + seq_len: 2048 +quant: + method: OsPlus + weight: + bit: 8 + symmetric: False + granularity: per_channel + act: + bit: 8 + symmetric: False + granularity: per_token + quant_out: True +save: + save_trans: False + save_path: ./save diff --git a/configs/quantization/QUIK/quik_w4a4_fakequant_eval.yml b/configs/quantization/QUIK/quik_w4a4_fakequant_eval.yml new file mode 100644 index 000000000..31d9f36a2 --- /dev/null +++ b/configs/quantization/QUIK/quik_w4a4_fakequant_eval.yml @@ -0,0 +1,41 @@ +base: + seed: &seed 0 +model: + type: Llama + path: model path + torch_dtype: auto +calib: + name: c4 + download: False + path: calib data path + n_samples: 128 + bs: 1 + seq_len: 2048 + preproc: c4_gptq + seed: *seed +eval: + eval_pos: [pretrain, fake_quant] + name: wikitext2 + download: False + path: eval data path + bs: 1 + seq_len: 2048 +quant: + method: QUIK + weight: + bit: 8 + symmetric: True + granularity: per_channel + group_size: -1 + act: + bit: 8 + symmetric: False + granularity: per_token + special: + fp_relative: False + fp_features: 256 + fp_threshold: 0.0 + last_fc_bit: 8 +save: + save_trans: False + save_path: ./save diff --git a/configs/quantization/QuaRot/quarot_w4a4.yml b/configs/quantization/QuaRot/quarot_w4a4.yml new file mode 100644 index 000000000..0c037d762 --- /dev/null +++ b/configs/quantization/QuaRot/quarot_w4a4.yml @@ -0,0 +1,36 @@ +base: + seed: &seed 0 +model: + type: Llama + path: model_path + torch_dtype: auto +eval: + eval_pos: [transformed, fake_quant] + name: wikitext2 + download: False + path: eval data path + bs: 1 + inference_per_block: False + seq_len: 2048 +quant: + method: Quarot + weight: + bit: 4 + symmetric: False + granularity: per_channel + group_size: -1 + qmax_to_tensor: True + calib_algo: minmax + act: + bit: 4 + symmetric: False + granularity: per_token + qmax_to_tensor: True + special: + rotate_mode: hadamard + fp32_had: True + online_rotate: True +save: + save_trans: False + save_fake: False + save_path: ./save diff --git a/configs/quantization/RTN/rtn_w4a16.yml b/configs/quantization/RTN/rtn_w4a16.yml new file mode 100644 index 000000000..d9ef3b399 --- /dev/null +++ b/configs/quantization/RTN/rtn_w4a16.yml @@ -0,0 +1,16 @@ +base: + seed: &seed 42 +model: + type: Llama + path: model path + torch_dtype: auto +quant: + method: RTN + weight: + bit: 4 + symmetric: False + granularity: per_group + group_size: 128 +save: + save_trans: False + save_path: ./save_w4a16 diff --git a/configs/quantization/RTN/rtn_w4a16_fakequant_eval.yml b/configs/quantization/RTN/rtn_w4a16_fakequant_eval.yml new file mode 100644 index 000000000..ffc3e6875 --- /dev/null +++ b/configs/quantization/RTN/rtn_w4a16_fakequant_eval.yml @@ -0,0 +1,23 @@ +base: + seed: &seed 42 +model: + type: Llama + path: model path + torch_dtype: auto +eval: + eval_pos: [pretrain, fake_quant] + name: [wikitext2, c4, ptb] + download: False + path: eval data path + bs: 1 + seq_len: 2048 +quant: + method: RTN + weight: + bit: 4 + symmetric: False + granularity: per_group + group_size: 128 +save: + save_trans: False + save_path: ./save diff --git a/configs/quantization/RTN/rtn_w8a8.yml b/configs/quantization/RTN/rtn_w8a8.yml new file mode 100644 index 000000000..d0315d8b7 --- /dev/null +++ 
b/configs/quantization/RTN/rtn_w8a8.yml @@ -0,0 +1,20 @@ +base: + seed: &seed 42 +model: + type: Llama + path: model path + torch_dtype: auto +quant: + method: RTN + weight: + bit: 8 + symmetric: False + granularity: per_group + group_size: 128 + act: + bit: 8 + symmetric: True + granularity: per_token +save: + save_trans: False + save_path: ./save_w8a8 diff --git a/configs/quantization/RTN/rtn_w8a8_fakequant_eval.yml b/configs/quantization/RTN/rtn_w8a8_fakequant_eval.yml new file mode 100644 index 000000000..a516c2b8c --- /dev/null +++ b/configs/quantization/RTN/rtn_w8a8_fakequant_eval.yml @@ -0,0 +1,26 @@ +base: + seed: &seed 42 +model: + type: Llama + path: model path + torch_dtype: auto +eval: + eval_pos: [pretrain, fake_quant] + name: wikitext2 + download: False + path: eval data path + bs: 1 + seq_len: 2048 +quant: + method: RTN + weight: + bit: 8 + symmetric: True + granularity: per_channel + act: + bit: 8 + symmetric: True + granularity: per_token +save: + save_trans: False + save_path: ./save diff --git a/configs/quantization/RTN/rtn_w8a8_pertensor_static.yml b/configs/quantization/RTN/rtn_w8a8_pertensor_static.yml new file mode 100644 index 000000000..fe0e44365 --- /dev/null +++ b/configs/quantization/RTN/rtn_w8a8_pertensor_static.yml @@ -0,0 +1,36 @@ +base: + seed: &seed 42 +model: + type: Llama + path: model path + torch_dtype: auto +calib: + name: pileval + download: False + path: calib data path + n_samples: 128 + bs: 1 + seq_len: 512 + preproc: general + seed: *seed +eval: + eval_pos: [fake_quant] + name: wikitext2 + download: False + path: eval data path + bs: 1 + seq_len: 2048 +quant: + method: RTN + weight: + bit: 8 + symmetric: True + granularity: per_channel + act: + bit: 8 + symmetric: True + granularity: per_tensor + static: True +save: + save_trans: False + save_path: ./save diff --git a/configs/quantization/SmoothQuant/smoothquant_llama_w8a8_fakequant_eval.yml b/configs/quantization/SmoothQuant/smoothquant_llama_w8a8_fakequant_eval.yml new file mode 100644 index 000000000..24f961bb3 --- /dev/null +++ b/configs/quantization/SmoothQuant/smoothquant_llama_w8a8_fakequant_eval.yml @@ -0,0 +1,35 @@ +base: + seed: &seed 42 +model: + type: Llama + path: model path + torch_dtype: auto +calib: + name: pileval + download: False + path: calib data path + n_samples: 512 + bs: 1 + seq_len: 512 + preproc: pileval_smooth + seed: *seed +eval: + eval_pos: [pretrain, transformed, fake_quant] + name: wikitext2 + download: False + path: eval data path + bs: 1 + seq_len: 2048 +quant: + method: SmoothQuant + weight: + bit: 8 + symmetric: True + granularity: per_channel + act: + bit: 8 + symmetric: True + granularity: per_token +save: + save_trans: False + save_path: ./save diff --git a/configs/quantization/SmoothQuant/smoothquant_llama_w8a8_fakequant_eval_general.yml b/configs/quantization/SmoothQuant/smoothquant_llama_w8a8_fakequant_eval_general.yml new file mode 100644 index 000000000..98fd8937c --- /dev/null +++ b/configs/quantization/SmoothQuant/smoothquant_llama_w8a8_fakequant_eval_general.yml @@ -0,0 +1,35 @@ +base: + seed: &seed 42 +model: + type: Llama + path: model path + torch_dtype: auto +calib: + name: pileval + download: False + path: calib data path + n_samples: 512 + bs: 1 + seq_len: 512 + preproc: general + seed: *seed +eval: + eval_pos: [pretrain, transformed, fake_quant] + name: wikitext2 + download: False + path: eval data path + bs: 1 + seq_len: 2048 +quant: + method: SmoothQuant + weight: + bit: 8 + symmetric: True + granularity: per_channel + act: + bit: 8 + 
symmetric: True + granularity: per_token +save: + save_trans: False + save_path: ./save diff --git a/configs/quantization/SmoothQuant/smoothquant_llama_w8a8_trt-llm.yml b/configs/quantization/SmoothQuant/smoothquant_llama_w8a8_trt-llm.yml new file mode 100644 index 000000000..a6ff9c645 --- /dev/null +++ b/configs/quantization/SmoothQuant/smoothquant_llama_w8a8_trt-llm.yml @@ -0,0 +1,35 @@ +base: + seed: &seed 42 +model: + type: Llama + path: model path + torch_dtype: auto +calib: + name: pileval + download: False + path: calib data path + n_samples: 512 + bs: 1 + seq_len: 512 + preproc: pileval_smooth + seed: *seed +eval: + eval_pos: [] + name: wikitext2 + download: False + path: eval data path + bs: 1 + seq_len: 2048 +quant: + method: SmoothQuant + weight: + bit: 8 + symmetric: True + granularity: per_channel + act: + bit: 8 + symmetric: True + granularity: per_token +save: + save_trans: True + save_path: ./save diff --git a/configs/quantization/SmoothQuant/smoothquant_opt_w8a8_fakequant_eval.yml b/configs/quantization/SmoothQuant/smoothquant_opt_w8a8_fakequant_eval.yml new file mode 100644 index 000000000..8ce109fe6 --- /dev/null +++ b/configs/quantization/SmoothQuant/smoothquant_opt_w8a8_fakequant_eval.yml @@ -0,0 +1,35 @@ +base: + seed: &seed 42 +model: + type: Opt + path: model path + torch_dtype: auto +calib: + name: pileval + download: False + path: calib data path + n_samples: 512 + bs: 1 + seq_len: 512 + preproc: pileval_smooth + seed: *seed +eval: + eval_pos: [pretrain, transformed, fake_quant] + name: wikitext2 + download: False + path: eval data path + bs: 1 + seq_len: 2048 +quant: + method: SmoothQuant + weight: + bit: 8 + symmetric: True + granularity: per_channel + act: + bit: 8 + symmetric: True + granularity: per_token +save: + save_trans: False + save_path: ./save diff --git a/configs/quantization/SmoothQuant/smoothquant_opt_w8a8_fakequant_eval_general.yml b/configs/quantization/SmoothQuant/smoothquant_opt_w8a8_fakequant_eval_general.yml new file mode 100644 index 000000000..81e55b0c8 --- /dev/null +++ b/configs/quantization/SmoothQuant/smoothquant_opt_w8a8_fakequant_eval_general.yml @@ -0,0 +1,35 @@ +base: + seed: &seed 42 +model: + type: Opt + path: model path + torch_dtype: auto +calib: + name: pileval + download: False + path: calib data path + n_samples: 512 + bs: 1 + seq_len: 512 + preproc: general + seed: *seed +eval: + eval_pos: [pretrain, transformed, fake_quant] + name: wikitext2 + download: False + path: eval data path + bs: 1 + seq_len: 2048 +quant: + method: SmoothQuant + weight: + bit: 8 + symmetric: True + granularity: per_channel + act: + bit: 8 + symmetric: True + granularity: per_token +save: + save_trans: False + save_path: ./save diff --git a/configs/quantization/SpQR/spqr_w4a16_fakequant_eval.yml b/configs/quantization/SpQR/spqr_w4a16_fakequant_eval.yml new file mode 100644 index 000000000..5d0dbea08 --- /dev/null +++ b/configs/quantization/SpQR/spqr_w4a16_fakequant_eval.yml @@ -0,0 +1,54 @@ +base: + seed: &seed 0 +model: + type: Llama + path: model path + torch_dtype: auto +calib: + name: c4 + download: False + n_samples: 128 + path: calib data path + bs: 1 + seq_len: 2048 + preproc: c4_gptq + seed: *seed +eval: + eval_pos: [pretrain, fake_quant] + name: wikitext2 + download: False + path: eval data path + bs: 1 + seq_len: 2048 +quant: + method: SpQR + weight: + bit: 4 + symmetric: False + granularity: per_group + group_size: 16 + round_zp: False + special: + actorder: True + percdamp: 1 + blocksize: 128 + true_sequential: True + relative_threshold: 
0.2 + simplified_outliers: False + scale: + bit: 3 + symmetric: False + granularity: per_group + group_size: 16 + round_zp: False + zero: + bit: 3 + symmetric: False + granularity: per_group + group_size: 16 + round_zp: False + quant_out: True +save: + save_trans: False + save_fake: False + save_path: ./save diff --git a/configs/quantization/SpinQuant/spinquant_w4a4.yml b/configs/quantization/SpinQuant/spinquant_w4a4.yml new file mode 100644 index 000000000..0609839d7 --- /dev/null +++ b/configs/quantization/SpinQuant/spinquant_w4a4.yml @@ -0,0 +1,63 @@ +base: + seed: &seed 0 +model: + type: Llama + path: model path + torch_dtype: auto +eval: + eval_pos: [transformed, fake_quant] + name: wikitext2 + download: False + path: eval data path + bs: 1 + inference_per_block: False + seq_len: 2048 +quant: + method: SpinQuant + weight: + bit: 4 + symmetric: False + granularity: per_channel + group_size: -1 + qmax_to_tensor: True + calib_algo: mse + ste: True + act: + bit: 4 + symmetric: False + granularity: per_token + qmax_to_tensor: True + ste: True + special: + rotate_mode: hadamard + fp32_had: True + online_rotate: True +train: + data: + name: wikitext2 + download: False + path: calib data path + n_samples: 800 + bs: 1 + seq_len: 2048 + preproc: wikitext2_gptq + seed: *seed + cache_dir: None + train_args: + fp16: False + bf16: True + log_on_each_node: False + per_device_train_batch_size: 1 + logging_steps: 1 + learning_rate: 1.5 + weight_decay: 0. + lr_scheduler_type: "cosine" + gradient_checkpointing: True + max_steps: 1 + output_dir: output_path + logging_dir: your_log_path + logging_first_step: True +save: + save_trans: False + save_fake: False + save_path: ./save diff --git a/configs/sparsification/Magnitude/magnitude.yml b/configs/sparsification/Magnitude/magnitude.yml new file mode 100644 index 000000000..e4b8957dc --- /dev/null +++ b/configs/sparsification/Magnitude/magnitude.yml @@ -0,0 +1,30 @@ +base: + seed: &seed 42 +model: + type: Llama + path: model path + torch_dtype: auto +calib: + name: pileval + download: False + path: calib data path + n_samples: 128 + bs: -1 + seq_len: 512 + preproc: general + seed: *seed +eval: + eval_pos: [transformed] + name: wikitext2 + download: False + path: eval data path + bs: 1 + seq_len: 2048 +sparse: + method: Magnitude + weight: + sparsity: 0.5 +save: + save_fp: False + save_lightllm: False + save_path: ./save diff --git a/configs/sparsification/ShortGPT/shortgpt.yml b/configs/sparsification/ShortGPT/shortgpt.yml new file mode 100644 index 000000000..f651e92a4 --- /dev/null +++ b/configs/sparsification/ShortGPT/shortgpt.yml @@ -0,0 +1,30 @@ +base: + seed: &seed 42 +model: + type: Llama + path: model path + torch_dtype: auto +calib: + name: pileval + download: False + path: calib data path + n_samples: 128 + bs: -1 + seq_len: 512 + preproc: general + seed: *seed +eval: + eval_pos: [transformed] + name: [wikitext2, c4] + download: False + path: eval data path + seq_len: 2048 +sparse: + method: ShortGPT + weight: + n_prune_layers: 9 +save: + save_trans: True + save_fp: False + save_lightllm: False + save_path: ./save diff --git a/configs/sparsification/Wanda/wanda.yml b/configs/sparsification/Wanda/wanda.yml new file mode 100644 index 000000000..a768242af --- /dev/null +++ b/configs/sparsification/Wanda/wanda.yml @@ -0,0 +1,31 @@ +base: + seed: &seed 42 +model: + type: Llama + path: model path + torch_dtype: auto +calib: + name: pileval + download: False + path: calib data path + n_samples: 128 + bs: -1 + seq_len: 512 + preproc: general + seed: *seed 
+eval:
+    eval_pos: [transformed]
+    name: [wikitext2, c4]
+    download: False
+    path: eval data path
+    bs: 1
+    seq_len: 2048
+sparse:
+    method: Wanda
+    weight:
+        sparsity: 0.5
+        sparsity_out: False
+save:
+    save_fp: False
+    save_lightllm: False
+    save_path: ./save
diff --git a/docs/en/source/advanced/model_test.md b/docs/en/source/advanced/model_test.md
new file mode 100644
index 000000000..76dc06f95
--- /dev/null
+++ b/docs/en/source/advanced/model_test.md
@@ -0,0 +1,181 @@
+# Model accuracy test
+
+## Accuracy test pipeline
+
+LLMC supports basic PPL (perplexity) evaluation, but further downstream task evaluation is not supported by LLMC itself.
+
+It is common practice to test the model directly with evaluation tools, including but not limited to:
+
+1. [lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-harness)
+
+2. [opencompass](https://github.com/open-compass/opencompass)
+
+However, this evaluation method is not efficient. We therefore recommend separating model inference from accuracy evaluation: the model is run by an inference engine and served as an API, and the evaluation tool evaluates that API. This approach has the following benefits:
+
+1. Using an efficient inference engine for model inference speeds up the entire evaluation process.
+
+2. Model inference and model evaluation are decoupled, each tool handles what it is good at, and the code structure is clearer.
+
+3. Serving the model with an inference engine is closer to the actual deployment scenario, making it easier to align with the accuracy of the deployed model.
+
+We recommend the following compression-deployment-evaluation pipeline: **LLMC compression - lightllm inference - opencompass evaluation**
+
+Here are the links to the relevant tools:
+
+1. llmc, Large Language Model Compression Tool, [[GitHub](https://github.com/ModelTC/llmc), [Doc](https://llmc-zhcn.readthedocs.io/en/latest/)]
+
+2. Lightllm, Large Language Model Inference Engine, [[GitHub](https://github.com/ModelTC/lightllm)]
+
+3. OpenCompass, Large Language Model Evaluation Tool, [[GitHub](https://github.com/open-compass/opencompass), [Doc](https://opencompass.readthedocs.io/zh-cn/latest/)]
+
+## Use of the lightllm inference engine
+
+The official [lightllm](https://github.com/ModelTC/lightllm) repository has more detailed documentation; here is a simple quick start.
+
+### Start a service of a float model
+
+**install lightllm**
+
+```
+git clone https://github.com/ModelTC/lightllm.git
+cd lightllm
+pip install -v -e .
+```
+
+**start a service**
+
+```
+python -m lightllm.server.api_server --model_dir model_path \
+                                     --host 0.0.0.0 \
+                                     --port 1030 \
+                                     --nccl_port 2066 \
+                                     --max_req_input_len 6144 \
+                                     --max_req_total_len 8192 \
+                                     --tp 2 \
+                                     --trust_remote_code \
+                                     --max_total_token_num 120000
+```
+
+The above command starts a service with 2 GPUs on port 1030 of the machine.
+
+The number of GPUs is set with `--tp`; tensor-parallel inference is performed across the `tp` GPUs, which is suitable for larger models.
+
+The `max_total_token_num` in the above command affects the throughput during the test and can be set according to the lightllm [documentation](https://github.com/ModelTC/lightllm/blob/main/docs/ApiServerArgs.md). As long as GPU memory is not exhausted, larger values are better.
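+
+As a sketch (assuming the machine has spare GPUs; the port numbers 1031 and 2067 below are arbitrary examples), a second service only needs its own `--port` and `--nccl_port`, and, if necessary, its own GPUs selected via `CUDA_VISIBLE_DEVICES`:
+
+```
+# Second lightllm service on the same machine: use a different --port and
+# --nccl_port, and pick other GPUs with CUDA_VISIBLE_DEVICES if required.
+CUDA_VISIBLE_DEVICES=2,3 \
+python -m lightllm.server.api_server --model_dir model_path \
+                                     --host 0.0.0.0 \
+                                     --port 1031 \
+                                     --nccl_port 2067 \
+                                     --max_req_input_len 6144 \
+                                     --max_req_total_len 8192 \
+                                     --tp 2 \
+                                     --trust_remote_code \
+                                     --max_total_token_num 120000
+```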
+
+If you want to run multiple lightllm services on the same machine, choose different `port` and `nccl_port` values so that they do not conflict.
+
+### Simple testing of the service
+
+Execute the following Python script:
+
+```
+import requests
+import json
+
+url = 'http://localhost:1030/generate'
+headers = {'Content-Type': 'application/json'}
+data = {
+    'inputs': 'What is AI?',
+    "parameters": {
+        'do_sample': False,
+        'ignore_eos': False,
+        'max_new_tokens': 128,
+    }
+}
+response = requests.post(url, headers=headers, data=json.dumps(data))
+if response.status_code == 200:
+    print(response.json())
+else:
+    print('Error:', response.status_code, response.text)
+```
+
+If the script returns normally, the service is working.
+
+### Start a service for a quantized model
+
+```
+python -m lightllm.server.api_server --model_dir <model path> \
+                                     --host 0.0.0.0 \
+                                     --port 1030 \
+                                     --nccl_port 2066 \
+                                     --max_req_input_len 6144 \
+                                     --max_req_total_len 8192 \
+                                     --tp 2 \
+                                     --trust_remote_code \
+                                     --max_total_token_num 120000 \
+                                     --mode triton_w4a16
+```
+
+The added `--mode triton_w4a16` flag means that naive w4a16 quantization is used.
+
+After the service is started, verify again that it is working.
+
+The model path used in the above command points to the original pre-trained model, which has not been adjusted by LLMC. Following the LLMC documentation, you can enable `save_trans`, save an adjusted model, and then run the naive quantization service command above on it.
+
+## Using the OpenCompass evaluation tool
+
+The official [opencompass](https://github.com/open-compass/opencompass) repository has more detailed documentation; here is a simple quick start.
+
+**Install opencompass**
+
+```
+git clone https://github.com/open-compass/opencompass.git
+cd opencompass
+pip install -v -e .
+```
+
+**Modify the config**
+
+The config file is [here](https://github.com/open-compass/opencompass/blob/main/configs/eval_lightllm.py). It is used by OpenCompass to evaluate the accuracy of Lightllm's API service; note that the port in its `url` must be consistent with the Lightllm service port above.
+
+To select the evaluation datasets, modify this part of the code:
+
+```
+with read_base():
+    from .summarizers.leaderboard import summarizer
+    from .datasets.humaneval.deprecated_humaneval_gen_a82cae import humaneval_datasets
+```
+
+The above snippet tests the humaneval dataset; more supported datasets can be found [here](https://github.com/open-compass/opencompass/tree/main/configs/datasets).
+
+**Dataset download**
+
+Prepare the datasets in advance according to the OpenCompass [documentation](https://opencompass.readthedocs.io/en/latest/get_started/installation.html#dataset-preparation).
+
+**Run accuracy tests**
+
+After modifying the above configuration file, you can run the following command:
+```
+python run.py configs/eval_lightllm.py
+```
+When inference and metric calculation are complete, we obtain the evaluation results of the model. An output folder is generated in the current directory: the logs subfolder records the evaluation logs, and the summary subfolder records the accuracy on the tested datasets.
+
+## FAQ
+
+** Q1 **
+
+In the OpenCompass dataset configuration files, what do the different suffixes for the same dataset mean?
+ +** Solution ** + +Different suffixes represent different prompt templates, and for detailed OpenCompass questions, please refer to the OpenCompass documentation + +** Q2 ** + +The test accuracy of the Humaneval of the LLAMA model is too low + +** Solution ** + +You may need to delete the \n at the end of each entry in the Humaneval jsonl file in the dataset provided by OpenCompass and retest it + +** Q3 ** + +The test is still not fast enough + +** Solution ** + +You can consider whether the max_total_token_num parameter settings are reasonable when starting the lightllm service, and if the setting is too small, the test concurrency will be low + diff --git a/docs/en/source/configs.md b/docs/en/source/configs.md index eaf3baaff..2a2c0f440 100644 --- a/docs/en/source/configs.md +++ b/docs/en/source/configs.md @@ -8,46 +8,39 @@ Here's a brief config example base: seed: &seed 42 # Set random seed model: - type: model_type # Type of the model - path: model path # Path to the model - tokenizer_mode: fast # Type of the model's tokenizer - torch_dtype: auto # Data type of the model + type: Llama # Type of model + path: model path # Model path + tokenizer_mode: fast # The tokenizer type of the model + torch_dtype: auto # Model dtype calib: - name: pileval # Name of the calibration dataset - download: False # Whether to download the calibration dataset online - path: calib data path # Path to the calibration dataset - n_samples: 512 # Number of samples in the calibration dataset - bs: 1 # Batch size for the calibration dataset - seq_len: 512 # Sequence length for the calibration dataset - preproc: pileval_smooth # Preprocessing method for the calibration dataset - seed: *seed # Random seed for the calibration dataset + name: pileval # Calibration data set name + download: False # Whether the calibration dataset can be downloaded online + path: calib data path # Calibration dataset path + n_samples: 512 # Number of calibration samples + bs: 1 # Batch size of calibration dataset + seq_len: 512 # Sequence length of calibration dataset + preproc: pileval_smooth # Pre-procession of the calibration dataset + seed: *seed # Random seed for calibration dataset eval: - eval_pos: [pretrain, transformed, fake_quant] # Evaluation points - name: wikitext2 # Name of the evaluation dataset - download: False # Whether to download the evaluation dataset online - path: eval data path # Path to the evaluation dataset - bs: 1 # Batch size for the evaluation dataset - seq_len: 2048 # Sequence length for the evaluation dataset - eval_token_consist: False # Whether to evaluate the consistency of tokens between the quantized and original models + eval_pos: [pretrain, transformed, fake_quant] # eval positon + name: wikitext2 # The name of the evaluation dataset + download: False # Whether the evaluation dataset can be downloaded online + path: eval data path # Path to evaluation dataset + bs: 1 # The batch size of the evaluation dataset + seq_len: 2048 # Sequence length of the evaluation dataset quant: method: SmoothQuant # Compression method weight: - bit: 8 # Number of quantization bits for weights - symmetric: True # Whether weight quantization is symmetric - granularity: per_channel # Granularity of weight quantization + bit: 8 # The number of quantified bits of the weight + symmetric: True # Is weight quantization a symmetric quantization + granularity: per_channel # The granularity of weight quantification act: - bit: 8 # Number of quantization bits for activations - symmetric: True # Whether activation 
quantization is symmetric - granularity: per_token # Granularity of activation quantization - speical: # Special parameters required for the quantization algorithm. Refer to the comments in the configuration file and the original paper for usage. + bit: 8 # Number of activated quantization bits + symmetric: True # Whether activation quantization is symmetric quantization + granularity: per_token # The granularity of activation quantification save: - save_vllm: False # Whether to save the real quantized model for VLLM inference - save_sgl: False # Whether to save the real quantized model for Sglang inference - save_autoawq: False # Whether to save the real quantized model for AutoAWQ inference - save_mlcllm: False # Whether to save the real quantized model for MLC-LLM inference - save_trans: False # Whether to save the model after weight transformation - save_fake: False # Whether to save the fake quantized weights - save_path: /path/to/save # Save path + save_trans: False # Whether to save the adjusted model + save_path: ./save # Save path ``` # Configs' detailed description @@ -362,45 +355,12 @@ quant: ## save - save.save_vllm + save.save_trans -Whether to save as a [VLLM](https://github.com/vllm-project/vllm) inference backend-supported real quantized model. +Whether to save the adjusted model weights -When this option is enabled, the saved model weights will significantly shrink (real quantization), and it can be directly loaded for inference using the VLLM backend. This improves inference speed and reduces memory usage. For more details on the [VLLM](https://github.com/vllm-project/vllm) inference backend, refer to [this section](https://llmc-en.readthedocs.io/en/latest/backend/vllm.html#). +The saved weight is the weight that is more suitable for quantization after adjustment, and it is still saved in the form of FP16, and when it is deployed in the inference engine, you need to enable NAIVE quantization to achieve quantitative inference - save.save_sgl + save.save_path -Whether to save as a [Sglang](https://github.com/sgl-project/sglang) inference backend-supported real quantized model. - -When this option is enabled, the saved model weights will significantly shrink (real quantization), and it can be directly loaded for inference using the [Sglang](https://github.com/sgl-project/sglang) backend. This improves inference speed and reduces memory usage. For more details on the [Sglang](https://github.com/sgl-project/sglang) inference backend, refer to [this section](https://llmc-en.readthedocs.io/en/latest/backend/sglang.html). - - save.save_autoawq - -Whether to save as an [AutoAWQ](https://github.com/casper-hansen/AutoAWQ) inference backend-supported real quantized model. - -When this option is enabled, the saved model weights will significantly shrink (real quantization), and it can be directly loaded for inference using the [AutoAWQ](https://github.com/casper-hansen/AutoAWQ) backend. This improves inference speed and reduces memory usage. For more details on the [AutoAWQ](https://github.com/casper-hansen/AutoAWQ) inference backend, refer to [this section](https://llmc-en.readthedocs.io/en/latest/backend/autoawq.html). - - save.save_mlcllm - -Whether to save as an [MLC-LLM](https://github.com/mlc-ai/mlc-llm) inference backend-supported real quantized model. - -When this option is enabled, the saved model weights will significantly shrink (real quantization), and it can be directly loaded for inference using the [MLC-LLM](https://github.com/mlc-ai/mlc-llm) backend. 
This improves inference speed and reduces memory usage. For more details on the [MLC-LLM](https://github.com/mlc-ai/mlc-llm) inference backend, refer to [this section](https://llmc-en.readthedocs.io/en/latest/backend/mlcllm.html). - - save.save_trans - -Whether to save the adjusted model weights. - -The saved weights are adjusted to be more suitable for quantization, possibly containing fewer outliers. They are still saved in fp16/bf16 format (with the same file size as the original model). When deploying the model in the inference engine, the engine's built-in `naive quantization` needs to be used to achieve quantized inference. - -Unlike `save_vllm` and similar options, this option requires the inference engine to perform real quantization, while `llmc` provides a floating-point model weight that is more suitable for quantization. - -For example, the `save_trans` models exported by algorithms such as `SmoothQuant, Os+, AWQ, and Quarot` have `fewer outliers` and are more suitable for quantization. - - - save.save_fake - -Whether to save the fake quantized model. - - save.save_path - -The path where the model is saved. This path must be a new, non-existent directory, otherwise, LLMC will terminate the run and issue an appropriate error message. \ No newline at end of file +Save the path of the model, which needs to be a new directory path that does not exist, otherwise the llmc will terminate the operation with a corresponding error message diff --git a/docs/en/source/index.rst b/docs/en/source/index.rst index ba663eb57..ea56418eb 100644 --- a/docs/en/source/index.rst +++ b/docs/en/source/index.rst @@ -29,25 +29,7 @@ arxiv: https://arxiv.org/abs/2405.06001 :maxdepth: 2 :caption: Advanced - advanced/model_test_v1.md - advanced/model_test_v2.md + advanced/model_test.md advanced/custom_dataset.md advanced/mix_bits.md advanced/sparsification.md - -.. toctree:: - :maxdepth: 2 - :caption: Best Practice - - practice/awq.md - practice/awq_omni.md - practice/quarot_gptq.md - -.. toctree:: - :maxdepth: 2 - :caption: Backbend - - backend/vllm.md - backend/sglang.md - backend/autoawq.md - backend/mlcllm.md diff --git a/docs/en/source/quickstart.md b/docs/en/source/quickstart.md index 303f26111..9e9a118c0 100644 --- a/docs/en/source/quickstart.md +++ b/docs/en/source/quickstart.md @@ -1,53 +1,48 @@ - -# Installing LLMC +# Installation of llmc ``` git clone https://github.com/ModelTC/llmc.git -cd llmc/ pip install -r requirements.txt ``` -# Preparing the Model - -**LLMC** currently supports only `hugging face` format models. For example, you can find the `Qwen2-0.5B` model [here](https://huggingface.co/Qwen/Qwen2-0.5B). Instructions for downloading can be found [here](https://zhuanlan.zhihu.com/p/663712983). +llmc does not need to be installed. To use llmc you only need to add this to the script. +``` +PYTHONPATH=[llmc's save path]:$PYTHONPATH +``` -For users in Mainland China, you can also use the [hugging face mirror](https://hf-mirror.com/). +# Prepare the model -An example of a simple download can be: +Currently, llmc only supports models in the Hugging Face format. In the case of Qwen2-0.5B, the model can be found [here](https://huggingface.co/Qwen/Qwen2-0.5B). 
+A simple download example can be used: ``` pip install -U hf-transfer -HF_ENDPOINT=https://hf-mirror.com HF_HUB_ENABLE_HF_TRANSFER=1 huggingface-cli download --resume-download Qwen/Qwen2-0.5B --local-dir Qwen2-0.5B +HF_HUB_ENABLE_HF_TRANSFER=1 huggingface-cli download --resume-download Qwen/Qwen2-0.5B --local-dir Qwen2-0.5B ``` -# Downloading the Dataset +# Download the datasets -**LLMC** requires datasets which are categorized into `calibration datasets` and `evaluation datasets`. The `calibration dataset` can be downloaded [here](https://github.com/ModelTC/llmc/blob/main/tools/download_calib_dataset.py) and the `evaluation dataset` can be downloaded [here](https://github.com/ModelTC/llmc/blob/main/tools/download_eval_dataset.py). +The datasets required by llmc can be divided into calibration datasets and eval datasets. The calibration dataset can be downloaded [here](https://github.com/ModelTC/llmc/blob/main/tools/download_calib_dataset.py), and the eval dataset can be downloaded [here](https://github.com/ModelTC/llmc/blob/main/tools/download_eval_dataset.py). -Additionally, **LLMC** supports downloading datasets online, by setting `download` to True in the `config`. +Of course, llmc also supports online download of datasets, as long as the download in the config is set to True. -```yaml -calib: - name: pileval - download: True -``` -# Setting Configuration Files +# Set Configs -All `configuration files` can be found [here](https://github.com/ModelTC/llmc/blob/main/configs/), and details on the `configuration files` can be referenced [in this section](https://llmc-en.readthedocs.io/en/latest/configs.html). For example, the SmoothQuant `config` is available [here](https://github.com/ModelTC/llmc/blob/main/configs/quantization/methods/SmoothQuant/smoothquant_w_a.yml). +In the case of smoothquant, the config is [here](https://github.com/ModelTC/llmc/blob/main/configs/quantization/SmoothQuant/smoothquant_llama_w8a8_fakequant_eval.yml). -```yaml +``` base: seed: &seed 42 model: - type: Qwen2 # Set model name, supporting models like Llama, Qwen2, Llava, Gemma2, etc. - path: # Set the model weight path + type: Qwen2 # Set the model name, which can support Llama, Qwen2, Llava, Gemma2 and other models. + path: # Set model weight path. torch_dtype: auto calib: name: pileval download: False - path: # Set calibration dataset path + path: # Set calibration dataset path. n_samples: 512 bs: 1 seq_len: 512 @@ -57,7 +52,7 @@ eval: eval_pos: [pretrain, transformed, fake_quant] name: wikitext2 download: False - path: # Set evaluation dataset path + path: # Set eval dataset path. bs: 1 seq_len: 2048 quant: @@ -71,41 +66,40 @@ quant: symmetric: True granularity: per_token save: - save_vllm: True # If set to True, the real quantized integer model is saved for inference with VLLM engine - save_trans: False # If set to True, adjusted floating-point weights will be saved + save_trans: True # Set to True to save the adjusted weights. save_path: ./save ``` -For more options and details about `save`, please refer to [this section](https://llmc-en.readthedocs.io/en/latest/configs.html). +# Start to run -**LLMC** provides many [algorithm configuration files](https://github.com/ModelTC/llmc/tree/main/configs/quantization/methods) under the `configs/quantization/methods` path for reference. 
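+
+Before running, you can check that a config parses into the nested structure LLMC expects: the entry point (`llmc/__main__.py`, shown later in this patch) reads the YAML with `yaml.safe_load` and wraps it in an `EasyDict`, so every key becomes an attribute. Below is a minimal sketch; the config path is just the SmoothQuant example referenced above, and the printed values assume that config:
+
+```
+import yaml
+from easydict import EasyDict
+
+# Parse the config the same way llmc's entry point does.
+with open('configs/quantization/SmoothQuant/smoothquant_llama_w8a8_fakequant_eval.yml') as f:
+    config = EasyDict(yaml.safe_load(f))
+
+# Nested YAML keys map directly to attributes used throughout llmc.
+print(config.quant.method)       # SmoothQuant
+print(config.quant.weight.bit)   # 8
+print(config.save.save_trans)    # True -> save the adjusted weights
+```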
+Once you are prepared above, you can run the following commands +``` +PYTHONPATH=[llmc's save path]:$PYTHONPATH \ +python -m llmc \ +--config configs/quantization/SmoothQuant/smoothquant_llama_w8a8_fakequant_eval.yml +``` +Under scripts file folder, llmc also provides a lot of running [scripts](https://github.com/ModelTC/llmc/tree/main/scripts) for your reference -# Running LLMC +``` +#!/bin/bash -**LLMC** does not require installation; simply modify the `local path` of **LLMC** in the [run script](https://github.com/ModelTC/llmc/blob/main/scripts/run_llmc.sh) as follows: +gpu_id=0 # Set the GPU id used. +export CUDA_VISIBLE_DEVICES=$gpu_id -```bash -llmc=/path/to/llmc +llmc= # Set the save path of llmc. export PYTHONPATH=$llmc:$PYTHONPATH -``` -You need to modify the configuration path in the [run script](https://github.com/ModelTC/llmc/blob/main/scripts/run_llmc.sh) according to the algorithm you want to run. For example, `${llmc}/configs/quantization/methods/SmoothQuant/smoothquant_w_a.yml` refers to the SmoothQuant quantization configuration file. `task_name` specifies the name of the `log file` generated by **LLMC** during execution. +task_name=smoothquant_llama_w8a8_fakequant_eval # Set task_name, the file name used to save the log. -```bash -task_name=smooth_w_a -config=${llmc}/configs/quantization/methods/SmoothQuant/smoothquant_w_a.yml -``` - -Once you have modified the LLMC path and config path in the run script, execute it: +# Select a config to run. +nohup \ +python -m llmc \ +--config ../configs/quantization/SmoothQuant/smoothquant_llama_w8a8_fakequant_eval.yml \ +> ${task_name}.log 2>&1 & -```bash -bash run_llmc.sh +echo $! > ${task_name}.pid ``` -# Quantization Inference - -If you have set the option to save `real quantized` models in the configuration file, such as `save_vllm: True`, then the saved `real quantized models` can be directly used for inference with the corresponding `inference backends`. For more details, refer to the `Backend` section of the [documentation](https://llmc-en.readthedocs.io/en/latest). - # FAQ ** Q1 ** diff --git a/docs/zh_cn/source/advanced/model_test.md b/docs/zh_cn/source/advanced/model_test.md new file mode 100644 index 000000000..3db87caf4 --- /dev/null +++ b/docs/zh_cn/source/advanced/model_test.md @@ -0,0 +1,180 @@ +# 模型精度测试 + +## 精度测试流程 + +llmc支持基础的ppl(perplexity,困惑度)评测,但是更多的下游任务评测,llmc本身并不支持。 + +常见的做法使用评测工具直接对模型进行推理测试,目前已有的评测工具包括但不限于 + +1. [lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-harness) + +2. [opencompass](https://github.com/open-compass/opencompass) + +但是这种评测方法评测效率不高,我们推荐使用**推理引擎评测工具分离**的方式进行模型精度评测,模型由推理引擎进行推理,并以api的形式serving起来,评测工具对该api进行评测。这种方式有以下的好处: + +1. 使用高效的推理引擎进行模型推理,可以加速整个评测进程 + +2. 将模型的推理和模型的评测分离开,各自负责份内专业的事,代码结构更清晰 + +3. 使用推理引擎推理模型,更符合实际部署的场景,和模型实际部署的精度更容易对齐 + +我们在此推荐并介绍使用以下的模型的压缩-部署-评测流程:**llmc压缩-lightllm推理-opencompass评测** + +以下是相关工具的链接: + +1. llmc,大模型压缩工具,[[github](https://github.com/ModelTC/llmc),[文档](https://llmc-zhcn.readthedocs.io/en/latest/)] + +2. lightllm,大模型推理引擎,[[github](https://github.com/ModelTC/lightllm)] + +3. opencompass,大模型评测工具,[[github](https://github.com/open-compass/opencompass),[文档](https://opencompass.readthedocs.io/zh-cn/latest/)] + + +## lightllm推理引擎的使用 + +[lightllm](https://github.com/ModelTC/llmc)官方仓库有着更详细的文档,这里仅给出一个简单快速入门的使用文档 + + 起一个float模型的服务 + +**安装lightllm** + +``` +git clone https://github.com/ModelTC/lightllm.git +cd lightllm +pip install -v -e . 
+``` + +**起服务** + +``` +python -m lightllm.server.api_server --model_dir 模型路径 \ + --host 0.0.0.0 \ + --port 1030 \ + --nccl_port 2066 \ + --max_req_input_len 6144 \ + --max_req_total_len 8192 \ + --tp 2 \ + --trust_remote_code \ + --max_total_token_num 120000 +``` + +上述命令将在本机的1030端口,起一个2卡的服务 + +上述命令可以通过tp的数量设置,在tp张卡上进行TensorParallel推理,适用于较大的模型的推理。 + +上述命令中的max_total_token_num,会影响测试过程中的吞吐性能,可以根据[lightllm文档](https://github.com/ModelTC/lightllm/blob/main/docs/ApiServerArgs.md),进行设置。只要不爆显存,往往设置越大越好。 + +如果要在同一个机器上起多个lightllm服务,需要重新设定上面的port和nccl_port,不要有冲突即可。 + + + 对服务进行简单测试 + +执行下面的python脚本 + +``` +import requests +import json + +url = 'http://localhost:1030/generate' +headers = {'Content-Type': 'application/json'} +data = { + 'inputs': 'What is AI?', + "parameters": { + 'do_sample': False, + 'ignore_eos': False, + 'max_new_tokens': 128, + } +} +response = requests.post(url, headers=headers, data=json.dumps(data)) +if response.status_code == 200: + print(response.json()) +else: + print('Error:', response.status_code, response.text) +``` + +若上述脚本是有正常返回,说明服务正常 + + 起一个量化模型的服务 + +``` +python -m lightllm.server.api_server --model_dir 模型路径 \ + --host 0.0.0.0 \ + --port 1030 \ + --nccl_port 2066 \ + --max_req_input_len 6144 \ + --max_req_total_len 8192 \ + --tp 2 \ + --trust_remote_code \ + --max_total_token_num 120000 \ + --mode triton_w4a16 +``` + +上述命令加了一个`--mode triton_w4a16`,表示使用了w4a16的naive量化 + +起完服务,同样需要验证一下服务是否正常 + +上述的命令使用的模型路径是原始预训练的模型,并没有经过llmc调整。可以按照llmc的文档,打开save_trans,保存一个调整之后的模型,然后再运行上述的naive量化服务命令 + +## opencompass评测工具的使用 + +[opencompass](https://github.com/open-compass/opencompass)官方仓库有着更详细的文档,这里仅给出一个简单快速入门的使用文档 + +**安装opencompass** + +``` +git clone https://github.com/open-compass/opencompass.git +cd opencompass +pip install -v -e . +``` + +**修改配置文件** + +配置文件在[这里](https://github.com/open-compass/opencompass/blob/main/configs/eval_lightllm.py),这个配置文件是用于opencompass来评测lightllm的api服务的精度,需要注意的是里面的`url`里面的port,要和上述的lightllm的服务port保持一致 + +评测的数据集选择,需要修改这部分代码 + +``` +with read_base(): + from .summarizers.leaderboard import summarizer + from .datasets.humaneval.deprecated_humaneval_gen_a82cae import humaneval_datasets +``` + +上述的代码片段,表示测试humaneval数据集,更多的数据集测试支持,可以查看[这里](https://github.com/open-compass/opencompass/tree/main/configs/datasets) + +**数据集下载** + +需要根据opencompass的[文档](https://opencompass.readthedocs.io/zh-cn/latest/get_started/installation.html#id2),最好数据集的准备 + +**运行精度测试** + +修改好上述的配置文件后,即可运行下面的命令 +``` +python run.py configs/eval_lightllm.py +``` +当模型完成推理和指标计算后,我们便可获得模型的评测结果。其中会在当前目录下生成output文件夹,logs子文件夹记录着评测中的日志,最后生成summary子文件会记录所测数据集的精度 + +## 常见问题 + +** 问题1 ** + +opencompass中的数据集配置文件,同一个数据集有不同的后缀,表示的是什么意思 + +** 解决方法 ** + +不同后缀表示不同的prompt模板,详细的opencompass问题,可以查看opencompass文档 + +** 问题2 ** + +llama模型的humaneval的测试精度过低 + +** 解决方法 ** + +可能需要将opencompass提供的数据集中的humaneval的jsonl文件里面每一条末尾的\n给删除,再重新测试一下 + +** 问题3 ** + +测试速度还是不够快 + +** 解决方法 ** + +可以考虑lightllm起服务时的max_total_token_num参数设置是否合理,过小的设置,会导致测试并发偏低 + diff --git a/docs/zh_cn/source/configs.md b/docs/zh_cn/source/configs.md index 9fbf1382d..cd1b9f3d9 100644 --- a/docs/zh_cn/source/configs.md +++ b/docs/zh_cn/source/configs.md @@ -1,6 +1,6 @@ # 配置的简要说明 -所有的配置均可以在[这里](https://github.com/ModelTC/llmc/tree/main/configs)找到,具体地,包括[量化算法](https://github.com/ModelTC/llmc/tree/main/configs/quantization/methods),[量化实践以及方法组合技](https://github.com/ModelTC/llmc/tree/main/configs/quantization/combination), 以及[推理后端](https://github.com/ModelTC/llmc/tree/main/configs/quantization/backend)相关的配置 
+所有的配置均可以在[这里](https://github.com/ModelTC/llmc/tree/main/configs)找到 下面的是一个简要的配置例子 @@ -8,7 +8,7 @@ base: seed: &seed 42 # 设置随机种子 model: - type: model_type # 模型的类型 + type: Llama # 模型的类型 path: model path # 模型的路径 tokenizer_mode: fast # 模型的tokenizer类型 torch_dtype: auto # 模型的dtype @@ -28,7 +28,6 @@ eval: path: eval data path # 评测数据集的路径 bs: 1 # 评测数据集的batch size seq_len: 2048 # 评测数据集的长度 - eval_token_consist: False # 是否评测量化模型和原始模型输出token的一致性 quant: method: SmoothQuant # 压缩方法 weight: @@ -39,15 +38,9 @@ quant: bit: 8 # 激活的量化bit数 symmetric: True # 激活量化是否是对称量化 granularity: per_token # 激活量化的粒度 - speical: # 量化算法需要的特殊参数,可参照每个算法的配置文件的注释以及原论文掌握其用法 save: - save_vllm: False # 是否保存真实量化的模型,以供VLLM推理 - save_sgl: False # 是否保存真实量化的模型,以供Sglang推理 - save_autoawq: False # 是否保存真实量化的模型,以供AutoAWQ推理 - save_mlcllm: False # 是否保存真实量化的模型,以供MLC-LLM推理 - save_trans: False # 是否保存权重变换之后的模型 - save_fake: False # 是否保存伪量化的权重 - save_path: /path/to/save # 保存路径 + save_trans: False # 是否保存调整之后的模型 + save_path: ./save # 保存路径 ``` # 配置的详细说明 @@ -210,11 +203,10 @@ general在[base_dataset](https://github.com/ModelTC/llmc/blob/main/llmc/data/dat ## eval -llmc默认支持评测量化模型的困惑度(PPL), 以及量化模型和原始模型输出token的一致性。此外还支持通过harness和opencompass评测下游任务的精度(可见[评测章节v1](https://llmc-zhcn.readthedocs.io/en/latest/advanced/model_test_v1.md)和[v2](https://llmc-zhcn.readthedocs.io/en/latest/advanced/model_test_v2.md)) eval.eval_pos -表示评测PPL的位点,目前支持三个位点可以被评测 +表示评测的位点,目前支持三个位点可以被评测 1. pretrain @@ -264,7 +256,7 @@ inference_per_block: True 同时测试多个数据集 -llmc也支持同时评测多个数据集的PPL +llmc也支持同时评测多个数据集 下面是评测单个wikitext2数据集的例子 @@ -291,9 +283,6 @@ eval: 如果直接使用llmc的[下载脚本](https://github.com/ModelTC/llmc/blob/main/tools/download_eval_dataset.py),则共有上层目录就是`--save_path`所指定的数据集保存路径 - eval.eval_token_consist - -表示是否评测量化模型和原始模型输出token的一致性,取值范围[0,1], 越接近1越说明量化模型的性能越接近原始模型 ## quant @@ -317,24 +306,11 @@ eval: 权重的量化粒度,支持以下粒度 -1. per_tensor - -2. per_channel - -3. per_group - - quant.weight.group_size - -当权重是per-group量化时,其表示group的大小 - - quant.weight.ste - -在权重量化的取整过程中,是否用直通估计器(straight-through estimator)来使round函数可以产生梯度以便进行反向传播 - - quant.weight.calib +1. per tensor -权重的校准方法,默认采用minmax,除此之外,llmc还支持learnable,mse两种方法,可能会取得更好的结果 +2. per channel +3. per group quant.act @@ -358,26 +334,15 @@ eval: 3. 
per head - quant.act.ste - -在激活量化的取整过程中,是否用直通估计器(straight-through estimator)来使round函数可以产生梯度以便进行反向传播 - - quant.act.calib - -激活的校准方法,默认采用minmax,且只支持minmax - -其中如果quant.method设置的为RTN,激活量化可以支持静态per tensor设置,下面是,权重静态per-channel量化,激活静态per tensor量化的配置和激活动态per token 8bit量化的配置 +其中如果quant.method设置的为RTN,激活量化可以支持静态per tensor设置,下面是一个W8A8,激活静态per tensor量化的配置 ``` quant: method: RTN - # 静态per-channel量化 weight: bit: 8 symmetric: True granularity: per_channel - - # 静态per-tensor量化 act: bit: 8 symmetric: True @@ -385,63 +350,13 @@ quant: static: True ``` -``` -quant: - method: RTN - #静态per-channel量化 - weight: - bit: 8 - symmetric: True - granularity: per_channel - - # 动态per-tensor量化 - act: - bit: 8 - symmetric: True - granularity: per_token -``` - ## save - save.save_vllm - -是否保存为[VLLM](https://github.com/vllm-project/vllm)推理后端支持的真实量化模型 - -当开启该选项时,你会发现保存的模型权重显著变小(真实量化),同时可以通过VLLM后端来直接加载推理,提高推理速度以及降低显存占用,有关于[VLLM](https://github.com/vllm-project/vllm)推理后端的内容见[该章节](https://llmc-zhcn.readthedocs.io/en/latest/backend/vllm.html) - - save.save_sgl - -是否保存为[Sglang](https://github.com/sgl-project/sglang)推理后端支持的真实量化 - -当开启该选项时,你会发现保存的模型权重显著变小(真实量化),同时可以通过[Sglang](https://github.com/sgl-project/sglang)后端来直接加载推理,提高推理速度以及降低显存占用,有关于[Sglang](https://github.com/sgl-project/sglang)推理后端的内容见[该章节](https://llmc-zhcn.readthedocs.io/en/latest/backend/sglang.html) - - - save.save_autoawq - -是否保存为[AutoAWQ](https://github.com/casper-hansen/AutoAWQ)推理后端支持的真实量化模型 - -当开启该选项时,你会发现保存的模型权重显著变小(真实量化),同时可以通过[AutoAWQ](https://github.com/casper-hansen/AutoAWQ)后端来直接加载推理,提高推理速度以及降低显存占用,有关于[AutoAWQ](https://github.com/casper-hansen/AutoAWQ)推理后端的内容见[该章节](https://llmc-zhcn.readthedocs.io/en/latest/backend/autoawq.html) - - save.save_mlcllm - -是否保存为[MLC-LLM](https://github.com/mlc-ai/mlc-llm)推理后端支持的真实量化模型 - -当开启该选项时,你会发现保存的模型权重显著变小(真实量化),同时可以通过[MLC-LLM](https://github.com/mlc-ai/mlc-llm)后端来直接加载推理,提高推理速度以及降低显存占用,有关于[MLC-LLM](https://github.com/mlc-ai/mlc-llm)推理后端的内容见[该章节](https://llmc-zhcn.readthedocs.io/en/latest/backend/mlcllm.html) - - save.save_trans 是否保存调整之后的模型权重 -保存的该权重,是经过调整之后的更适合量化的权重,其可能包含更少的离群值,其还是以fp16/bf16的格式保存(权重文件大小与原始模型保持一致),在推理引擎中部署的时候,需要开启推理引擎的`naive量化`功能,即可实现量化推理。 - -与`save_vllm`等不同的是,其需要该推理引擎来完成真实量化,而`llmc`提供一个更适合量化的浮点模型权重。 - -例如`SmoothQuant/Os+/AWQ/Quarot`等算法导出的`save_trans`模型,其具有`更少的outliers`,更适合量化。 - - save.save_fake - -是否保存伪量化的模型 +保存的该权重,是经过调整之后的更适合量化的权重,它还是以fp16形式保存,在推理引擎中部署的时候,需要开启naive量化,即可实现量化推理 save.save_path diff --git a/docs/zh_cn/source/index.rst b/docs/zh_cn/source/index.rst index e07a1de84..a2e66ce60 100644 --- a/docs/zh_cn/source/index.rst +++ b/docs/zh_cn/source/index.rst @@ -30,25 +30,8 @@ arxiv链接: https://arxiv.org/abs/2405.06001 :maxdepth: 2 :caption: 进阶用法 - advanced/model_test_v1.md - advanced/model_test_v2.md + advanced/model_test.md advanced/custom_dataset.md advanced/mix_bits.md advanced/sparsification.md -.. toctree:: - :maxdepth: 2 - :caption: 量化最佳实践 - - practice/awq.md - practice/awq_omni.md - practice/quarot_gptq.md - -.. 
toctree:: - :maxdepth: 2 - :caption: 量化推理后端 - - backend/vllm.md - backend/sglang.md - backend/autoawq.md - backend/mlcllm.md \ No newline at end of file diff --git a/docs/zh_cn/source/quickstart.md b/docs/zh_cn/source/quickstart.md index 6d4854d12..34811d21d 100644 --- a/docs/zh_cn/source/quickstart.md +++ b/docs/zh_cn/source/quickstart.md @@ -1,14 +1,18 @@ -# LLMC的安装 +# llmc的安装 ``` git clone https://github.com/ModelTC/llmc.git -cd llmc/ pip install -r requirements.txt ``` +llmc无需安装,使用llmc只需在脚本中添加 +``` +PYTHONPATH=llmc的下载路径:$PYTHONPATH +``` + # 准备模型 -**LLMC**目前仅支持`hugging face`格式的模型。以`Qwen2-0.5B`为例,可以在[这里](https://huggingface.co/Qwen/Qwen2-0.5B)找到模型。下载方式可以参考[这里](https://zhuanlan.zhihu.com/p/663712983) +llmc目前仅支持hugging face格式的模型。以Qwen2-0.5B为例,可以在[这里](https://huggingface.co/Qwen/Qwen2-0.5B)找到模型。下载方式可以参考[这里](https://zhuanlan.zhihu.com/p/663712983) 大陆地区用户还可以使用[hugging face镜像](https://hf-mirror.com/) @@ -21,22 +25,15 @@ HF_ENDPOINT=https://hf-mirror.com HF_HUB_ENABLE_HF_TRANSFER=1 huggingface-cli do # 下载数据集 -**LLMC**需要的数据集可以分为`校准数据集`和`测试数据集`。`校准数据集`可以在[这里](https://github.com/ModelTC/llmc/blob/main/tools/download_calib_dataset.py)下载,`测试数据`集可以在[这里](https://github.com/ModelTC/llmc/blob/main/tools/download_eval_dataset.py)下载 +llmc需要的数据集可以分为校准数据集和测试数据集。校准数据集可以在[这里](https://github.com/ModelTC/llmc/blob/main/tools/download_calib_dataset.py)下载,测试数据集可以在[这里](https://github.com/ModelTC/llmc/blob/main/tools/download_eval_dataset.py)下载 -当然**LLMC**也支持在线下载数据集,只需要在`config`中的`download`设置为True即可。 +当然llmc也支持在线下载数据集,只需要在config中的download设置为True即可。 -```yaml -calib: - name: pileval - download: True -``` - -# 设置配置文件 +# 设置config -所有的`配置文件`都在[这里](https://github.com/ModelTC/llmc/blob/main/configs/)可以找到,同时关于`配置文件`的说明请参考[此章节](https://llmc-zhcn.readthedocs.io/en/latest/configs.html) -以SmoothQuant为例,`config`在[这里](https://github.com/ModelTC/llmc/blob/main/configs/quantization/methods/SmoothQuant/smoothquant_w_a.yml) +以smoothquant为例,config在[这里](https://github.com/ModelTC/llmc/blob/main/configs/quantization/SmoothQuant/smoothquant_llama_w8a8_fakequant_eval.yml) -```yaml +``` base: seed: &seed 42 model: @@ -70,39 +67,39 @@ quant: symmetric: True granularity: per_token save: - save_vllm: True # 当设置为True时,可以保存真实量化的整型模型,并通过VLLM推理引擎进行推理 - save_trans: False # 当设置为True,可以保存下调整之后的浮点权重 + save_trans: True # 设置为True,可以保存下调整之后的权重 save_path: ./save ``` -有关于`save`的更多选项和说明,请参照[此章节](https://llmc-zhcn.readthedocs.io/en/latest/configs.html) - - -**LLMC**在`configs/quantization/methods`路径下,提供了很多的[算法配置文件](https://github.com/ModelTC/llmc/tree/main/configs/quantization/methods)供大家参考。 # 开始运行 -**LLMC**无需安装,只需在[运行脚本](https://github.com/ModelTC/llmc/blob/main/scripts/run_llmc.sh)中将`/path/to/llmc`修改为**LLMC**的`本地路径`即可。 -```bash -llmc=/path/to/llmc -export PYTHONPATH=$llmc:$PYTHONPATH +做好上面的准备之后,可以通过以下的命令运行 ``` +PYTHONPATH=llmc的下载路径:$PYTHONPATH \ +python -m llmc \ +--config configs/quantization/SmoothQuant/smoothquant_llama_w8a8_fakequant_eval.yml +``` +llmc在scripts下,也提供了很多的运行[脚本](https://github.com/ModelTC/llmc/tree/main/scripts)供大家参考 -根据你想运行的算法,需相应修改[运行脚本](https://github.com/ModelTC/llmc/blob/main/scripts/run_llmc.sh)中的配置路径。例如,`${llmc}/configs/quantization/methods/SmoothQuant/smoothquant_w_a.yml`对应的是 SmoothQuant 量化的配置文件。`task_name`用于指定**LLMC**运行时生成的`日志文件名称`。 - -```bash -task_name=smooth_w_a -config=${llmc}/configs/quantization/methods/SmoothQuant/smoothquant_w_a.yml ``` +#!/bin/bash -当在运行脚本中,修改完相应的LLMC路径和config路径后,运行即可: +gpu_id=0 # 设置使用的GPU id +export CUDA_VISIBLE_DEVICES=$gpu_id -```bash -bash run_llmc.sh -``` +llmc= # 设置llmc的下载路径 +export 
PYTHONPATH=$llmc:$PYTHONPATH -# 量化推理 +task_name=smoothquant_llama_w8a8_fakequant_eval # 设置task_name,用于保存log的文件名 -假设你在配置文件中指定了保存`真实量化`模型的选项,例如 `save_vllm: True`,那么保存的`真实量化模型`即可直接用于对应的`推理后端`执行,具体可参照[文档](https://llmc-zhcn.readthedocs.io/en/latest)的`量化推理后端`章节。 +# 选择某个config运行 +nohup \ +python -m llmc \ +--config ../configs/quantization/SmoothQuant/smoothquant_llama_w8a8_fakequant_eval.yml \ +> ${task_name}.log 2>&1 & + +echo $! > ${task_name}.pid +``` # 常见问题 diff --git a/examples/backend/autoawq/infer_with_autoawq.py b/examples/backend/autoawq/infer_with_autoawq.py deleted file mode 100644 index a157f0620..000000000 --- a/examples/backend/autoawq/infer_with_autoawq.py +++ /dev/null @@ -1,34 +0,0 @@ - - -import sys - -autoawq_path = '/path/to/AutoAWQ' -sys.path.append(autoawq_path) - -import torch -from awq import AutoAWQForCausalLM -from transformers import AutoTokenizer, TextStreamer - -model_path = '/path/to/save_for_autoawq_awq_w4/autoawq_quant_model' - -tokenizer = AutoTokenizer.from_pretrained(model_path) -streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True) - -model = AutoAWQForCausalLM.from_quantized( - model_path, - torch_dtype=torch.float16, - low_cpu_mem_usage=True, - device_map='auto', -) - - -prompt_text = 'The president of the United States is ' -inputs = tokenizer(prompt_text, return_tensors='pt').to('cuda') - -outputs = model.generate( - **inputs, - do_sample=False, - max_new_tokens=100, - streamer=streamer, - eos_token_id=[tokenizer.eos_token_id, tokenizer.convert_tokens_to_ids('<|eot_id|>')] -) diff --git a/examples/backend/mlcllm/infer_with_mlcllm.py b/examples/backend/mlcllm/infer_with_mlcllm.py deleted file mode 100644 index be9523aa7..000000000 --- a/examples/backend/mlcllm/infer_with_mlcllm.py +++ /dev/null @@ -1,17 +0,0 @@ -from mlc_llm import MLCEngine - -# Create engine -model_path = './dist/llama2-7b-chat-MLC/' -engine = MLCEngine(model_path) - -# Run chat completion in OpenAI API. 
-for response in engine.chat.completions.create( - messages=[{'role': 'user', 'content': 'What is the meaning of life?'}], - model=model_path, - stream=True, -): - for choice in response.choices: - print(choice.delta.content, end='', flush=True) -print('\n') - -engine.terminate() diff --git a/examples/backend/sglang/infer_with_sglang.py b/examples/backend/sglang/infer_with_sglang.py deleted file mode 100644 index 2a92b807c..000000000 --- a/examples/backend/sglang/infer_with_sglang.py +++ /dev/null @@ -1,13 +0,0 @@ -import openai - -client = openai.Client( - base_url='http://127.0.0.1:10000/v1', api_key='EMPTY') - -# Text completion -response = client.completions.create( - model='default', - prompt='The president of the United States is', - temperature=0, - max_tokens=32, -) -print(response) diff --git a/examples/backend/vllm/infer_with_vllm.py b/examples/backend/vllm/infer_with_vllm.py deleted file mode 100644 index 8b77349d1..000000000 --- a/examples/backend/vllm/infer_with_vllm.py +++ /dev/null @@ -1,21 +0,0 @@ -from transformers import AutoTokenizer -from vllm import LLM, SamplingParams - -model_path = '/path/to/save_for_vllm_awq_w4/real_quant_model' -model = LLM(model_path) -tokenizer = AutoTokenizer.from_pretrained(model_path) - -prompts = [ - 'Hello, my name is', - 'The president of the United States is', - 'The capital of France is', - 'The future of AI is', -] -sampling_params = SamplingParams(temperature=0.8, top_p=0.95) - -outputs = model.generate(prompts, sampling_params) - -for output in outputs: - prompt = output.prompt - generated_text = output.outputs[0].text - print(f'Prompt: {prompt!r}, Generated text: {generated_text!r}') diff --git a/llmc/__main__.py b/llmc/__main__.py index 08a03ab4e..62f43f670 100644 --- a/llmc/__main__.py +++ b/llmc/__main__.py @@ -6,19 +6,19 @@ import time import torch +import transformers import yaml from easydict import EasyDict from loguru import logger -from torch.distributed import destroy_process_group, init_process_group +from transformers import (LlamaTokenizerFast, Trainer, TrainingArguments, + default_data_collator) from llmc.compression.quantization import * from llmc.compression.sparsification import * -from llmc.data import BaseDataset, BaseTokenizer -from llmc.eval import AccuracyEval, PerplexityEval, TokenConsistencyEval +from llmc.data import BaseDataset, BaseTokenizer, TrainJsonDataset +from llmc.eval import PerplexityEval from llmc.models import * -from llmc.utils import (check_config, mkdirs, print_important_package_version, - seed_all, update_autoawq_quant_config, - update_vllm_quant_config) +from llmc.utils import check_config, mkdirs, seed_all from llmc.utils.registry_factory import ALGO_REGISTRY, MODEL_REGISTRY @@ -43,66 +43,99 @@ def main(config): eval_config.name = name if len(name_list) != 1: # eval multi datasets eval_config.path = os.path.join(config.eval.path, name) - if config.eval.type == 'acc': - acc_eval = AccuracyEval(eval_config) - eval_list.append(acc_eval) - else: - ppl_eval = PerplexityEval(tokenizer.get_tokenizer(), eval_config) - eval_list.append(ppl_eval) + ppl_eval = PerplexityEval(tokenizer.get_tokenizer(), eval_config) + eval_list.append(ppl_eval) if 'eval' in config and 'pretrain' in config.eval.eval_pos: - if config.eval.type == 'acc': - for acc_eval in eval_list: - acc = acc_eval.eval(model) - logger.info(f'{config.eval.name} acc : {acc}') - else: - for ppl_eval in eval_list: - ppl = ppl_eval.eval(model) - logger.info(f'{ppl_eval.dataset} ppl : {ppl}') + for ppl_eval in eval_list: + ppl = 
ppl_eval.eval(model) + logger.info(f'{ppl_eval.dataset} ppl : {ppl}') if not config.get('calib', False): blockwise_opt = ALGO_REGISTRY[config.quant.method]( - model, - quant_config=config.quant, - input=None, - padding_mask=None, - config=config + model, quant_config=config.quant, input=None, config=config ) blockwise_opt.run_block_loop() else: - dataset = BaseDataset(tokenizer.get_tokenizer(), config.calib, model.processor) - calib_data, padding_mask = dataset.get_calib_dataset() - model.collect_first_block_input(calib_data, config.calib.type) + dataset = BaseDataset(tokenizer.get_tokenizer(), config.calib) + calib_data = dataset.get_calib_dataset() + model.collect_first_block_input(calib_data) del calib_data gc.collect() torch.cuda.empty_cache() if not config.get('sparse', False): blockwise_opt = ALGO_REGISTRY[config.quant.method]( - model, - config.quant, - model.get_first_block_input(), - padding_mask, - config + model, config.quant, model.get_first_block_input(), config ) else: blockwise_opt = ALGO_REGISTRY[config.sparse.method]( - model, - config.sparse, - model.get_first_block_input(), - padding_mask, - config + model, config.sparse, model.get_first_block_input(), config ) blockwise_opt.run_block_loop() + + if 'train' in config: + + blockwise_opt.deploy('train_rotate_quant') + + dataset = BaseDataset(tokenizer.get_tokenizer(), config.train.data) + + train_tokenizer = LlamaTokenizerFast.from_pretrained( + pretrained_model_name_or_path=config.model.path, + cache_dir=config.train.data.cache_dir, + model_max_length=config.train.data.seq_len, + padding_side='right', + use_fast=True, + add_eos_token=False, + add_bos_token=False, + ) + + + if 'eval' in config and len(config.eval.eval_pos): + eval_list = [] + name_list = ( + config.eval.name + if not isinstance(config.eval.name, str) + else [config.eval.name] + ) + for name in name_list: + eval_config = copy.deepcopy(config.eval) + eval_config.name = name + if len(name_list) != 1: # eval multi datasets + eval_config.path = os.path.join(config.eval.path, name) + ppl_eval = PerplexityEval(train_tokenizer, eval_config) + eval_list.append(ppl_eval) + + train_data = TrainJsonDataset( + dataset.calib_dataset, + train_tokenizer, + block_size=config.train.data.seq_len, + ) + + train_args = TrainingArguments(**config.train.train_args) + trainable_parameters = blockwise_opt.get_trainable_params() + blockwise_opt.model.model.seqlen = config.train.data.seq_len + optimizer = SGDG(trainable_parameters, lr=config.train.train_args.learning_rate, stiefel=True) + + trainer = Trainer( + model=blockwise_opt.model.model, + tokenizer=train_tokenizer, + args=train_args, + train_dataset=train_data, + eval_dataset=None, + data_collator=default_data_collator, + optimizers=(optimizer, None), + ) + + trainer.train() + + logger.info('End training') + + if 'eval' in config and 'transformed' in config.eval.eval_pos: blockwise_opt.deploy('origin_float') - if config.eval.type == 'acc': - for acc_eval in eval_list: - acc = acc_eval.eval(model) - logger.info(f'{config.eval.name} acc : {acc}') - else: - for ppl_eval in eval_list: - ppl = ppl_eval.eval(model) - logger.info(f'{ppl_eval.dataset} ppl : {ppl}') + for ppl_eval in eval_list: + ppl = ppl_eval.eval(model) + logger.info(f'{ppl_eval.dataset} ppl : {ppl}') if 'save' in config and config.save.get('save_trans', False): blockwise_opt.save_model(save_trans_path) @@ -119,123 +152,35 @@ def main(config): if 'eval' in config and 'fake_quant' in config.eval.eval_pos: blockwise_opt.deploy('fake_quant') - if config.eval.type == 
'acc': - for acc_eval in eval_list: - acc = acc_eval.eval(model) - logger.info(f'{config.eval.name} acc : {acc}') - else: - for ppl_eval in eval_list: - ppl = ppl_eval.eval(model) - logger.info(f'{ppl_eval.dataset} ppl : {ppl}') - - if 'eval_token_consist' in config.eval and config.eval.eval_token_consist: - org_model = MODEL_REGISTRY[config.model.type]( - config.model.path, config.model.torch_dtype - ) - token_consist_eval = TokenConsistencyEval(tokenizer.get_tokenizer(), - eval_config) - consistency_ratio = token_consist_eval.eval(model, org_model) - logger.info(f'Token consistency ratio: {consistency_ratio}') - del org_model + for ppl_eval in eval_list: + ppl = ppl_eval.eval(model) + logger.info(f'{ppl_eval.dataset} ppl : {ppl}') if 'save' in config and config.save.get('save_fake', False): blockwise_opt.deploy('fake_quant') blockwise_opt.save_model(save_fake_path) - if 'save' in config and config.save.get('save_vllm', False): - w, a = config.quant.weight, config.quant.get('act') - if isinstance(w.bit, str): - assert a, 'Only WA float quant is supported.' - assert w.symmetric and a.symmetric, 'Only symmetric quant is supported.' - assert w.bit == a.bit and w.bit in ['e4m3', 'e5m2'] and \ - a.bit in ['e4m3', 'e5m2'], 'Only WA FP8 quant is supported' - else: - assert w.symmetric, 'Only symmetric quant is supported.' - assert w.bit in [4, 8], 'Supported quant: w4a16, w8a16, w8a8.' - if a: - assert a.symmetric, 'Only symmetric quant is supported.' - assert a.bit == 8, 'Supported quant: w4a16, w8a16, w8a8.' - blockwise_opt.deploy('vllm_quant') - blockwise_opt.save_model(save_quant_path) - update_vllm_quant_config(blockwise_opt.model, config, save_quant_path) - - if 'save' in config and config.save.get('save_sgl', False): - w, a = config.quant.weight, config.quant.get('act') - if isinstance(w.bit, str): - assert a, 'Only WA float quant is supported.' - assert w.symmetric and a.symmetric, 'Only symmetric quant is supported.' - assert w.bit == a.bit and w.bit in ['e4m3', 'e5m2'] and \ - a.bit in ['e4m3', 'e5m2'], 'Only WA FP8 quant is supported' - else: - assert w.symmetric, 'Only symmetric quant is supported.' - assert w.bit in [4, 8], 'Supported quant: w4a16, w8a16, w8a8.' - if a: - assert a.symmetric, 'Only symmetric quant is supported.' - assert a.bit == 8, 'Supported quant: w4a16, w8a16, w8a8.' - blockwise_opt.deploy('sgl_quant') - blockwise_opt.save_model(save_quant_path) - update_vllm_quant_config(blockwise_opt.model, config, save_quant_path) - - if 'save' in config and config.save.get('save_autoawq', False): - assert config.quant.weight.bit in [4] and 'act' not in config.quant, \ - 'AutoAWQ supports only 4-bit weight-only quantization.' - assert not config.quant.weight.symmetric, 'Only asymmetric quant is supported.' - - blockwise_opt.deploy('autoawq_quant') - blockwise_opt.save_model(save_quant_path) - update_autoawq_quant_config(config, save_quant_path) - - if 'save' in config and config.save.get('save_mlcllm', False): - assert config.quant.weight.bit in [4] and 'act' not in config.quant, \ - 'MlcLLM supports only 4-bit weight-only quantization.' - assert not config.quant.weight.symmetric, 'Only asymmetric quant is supported.' 
- - blockwise_opt.deploy('mlcllm_quant') + if 'save' in config and config.save.get('save_lightllm', False): + blockwise_opt.deploy('real_quant') blockwise_opt.save_model(save_quant_path) - update_autoawq_quant_config(config, save_quant_path) - - if 'opencompass' in config: - assert config.save.get('save_trans', False) - cfg_path = config['opencompass']['cfg_path'] - output_path = config['opencompass']['output_path'] - eval_model_path = os.path.abspath(save_trans_path) - opencompass_cmd = ( - f'opencompass {cfg_path} -w {output_path} ' - f'--llmc_cfg {args.config} ' - f'--llmc_eval_mode quant ' - f'--llmc_model_path {eval_model_path}' - ) - logger.info(f'opencompass_cmd : {opencompass_cmd}') - os.system(opencompass_cmd) if __name__ == '__main__': llmc_start_time = time.time() parser = argparse.ArgumentParser() parser.add_argument('--config', type=str, required=True) - parser.add_argument('--task_id', type=str, required=True) args = parser.parse_args() with open(args.config, 'r') as file: config = yaml.safe_load(file) config = EasyDict(config) - init_process_group(backend='nccl') - torch.cuda.set_device(int(os.environ['LOCAL_RANK'])) - - if int(os.environ['RANK']) != 0: - logger.remove() - check_config(config) logger.info(f'args: {args}') logger.info(f'config:\n{json.dumps(config, ensure_ascii=False, indent=4)}') - print_important_package_version() - - logger.info(f'WORLD_SIZE : {int(os.environ["WORLD_SIZE"])}') - - seed_all(config.base.seed + int(os.environ['RANK'])) + seed_all(config.base.seed) # mkdirs if 'save' in config: @@ -251,17 +196,8 @@ def main(config): config.save.save_path, 'trtllm_engine' ) mkdirs(save_trtllm_engine_path) - if config.save.get('save_vllm', False): - save_quant_path = os.path.join(config.save.save_path, 'vllm_quant_model') - mkdirs(save_quant_path) - if config.save.get('save_sgl', False): - save_quant_path = os.path.join(config.save.save_path, 'sgl_quant_model') - mkdirs(save_quant_path) - if config.save.get('save_autoawq', False): - save_quant_path = os.path.join(config.save.save_path, 'autoawq_quant_model') - mkdirs(save_quant_path) - if config.save.get('save_mlcllm', False): - save_quant_path = os.path.join(config.save.save_path, 'mlcllm_quant_model') + if config.save.get('save_lightllm', False): + save_quant_path = os.path.join(config.save.save_path, 'real_quant_model') mkdirs(save_quant_path) if config.save.get('save_fake', False): save_fake_path = os.path.join(config.save.save_path, 'fake_quant_model') @@ -269,8 +205,6 @@ def main(config): main(config) - destroy_process_group() - llmc_end_time = time.time() llmc_duration_time = llmc_end_time - llmc_start_time logger.info(f'llmc_duration_time: {llmc_duration_time} s') diff --git a/llmc/compression/blockwise_optimization.py b/llmc/compression/blockwise_optimization.py index 0dc7ccbfa..d8d844c7c 100644 --- a/llmc/compression/blockwise_optimization.py +++ b/llmc/compression/blockwise_optimization.py @@ -5,13 +5,12 @@ class BlockwiseOpt(metaclass=ABCMeta): - def __init__(self, model, quant_config, input, padding_mask, config): + def __init__(self, model, quant_config, input, config): self.model = model self.blocks = model.get_blocks() self.quant_config = quant_config self.sparsity_config = quant_config self.input = input - self.padding_mask = padding_mask self.data_free = False if self.input else True self.config = config self.block_idx = None @@ -20,9 +19,6 @@ def __init__(self, model, quant_config, input, padding_mask, config): for i in range(len(input['kwargs'])): if 'use_cache' in input['kwargs'][i]: 
input['kwargs'][i].pop('use_cache') - for i in range(len(input['kwargs'])): - if 'past_key_value' in input['kwargs'][i]: - input['kwargs'][i]['past_key_value'] = None self.n_samples = 0 for i in range(len(input['data'])): self.n_samples += input['data'][i].shape[0] diff --git a/llmc/compression/quantization/__init__.py b/llmc/compression/quantization/__init__.py index a57973ace..356d51d80 100644 --- a/llmc/compression/quantization/__init__.py +++ b/llmc/compression/quantization/__init__.py @@ -9,9 +9,11 @@ from .ntweak import NormTweaking from .omniq import OmniQuant from .osplus import OsPlus -from .quant import FloatQuantizer, IntegerQuantizer +from .quant import Quantizer from .quarot import Quarot from .quik import QUIK from .rtn import RTN from .smoothquant import SmoothQuant +from .spinquant import SpinQuant from .spqr import SpQR +from .train_utils import SGDG diff --git a/llmc/compression/quantization/awq.py b/llmc/compression/quantization/awq.py index 99b28709b..8a2b291e6 100644 --- a/llmc/compression/quantization/awq.py +++ b/llmc/compression/quantization/awq.py @@ -1,8 +1,6 @@ import gc -import os import torch -import torch.distributed as dist import torch.nn as nn from loguru import logger @@ -17,8 +15,8 @@ @ALGO_REGISTRY class Awq(BaseBlockwiseQuantization): - def __init__(self, model, quant_config, input, padding_mask, config): - super().__init__(model, quant_config, input, padding_mask, config) + def __init__(self, model, quant_config, input, config): + super().__init__(model, quant_config, input, config) special_config = self.quant_config.get('special', {}) self.trans = special_config.get('trans', True) self.trans_version = special_config.get('trans_version', 'v2') @@ -40,10 +38,7 @@ def get_weight_scale(self, layers_dict): ) weights = wquantizer.reshape_tensor(weights) scale = weights.abs() / weights.abs().amax(dim=1, keepdim=True) - try: - scale = scale.view(org_shape) - except RuntimeError: - scale = wquantizer.restore_tensor(scale, org_shape) + scale = scale.view(org_shape) scale = scale.mean(0) del weights gc.collect() @@ -126,15 +121,12 @@ def search_scale_subset(self, layers_dict, input, inspect_module, subset_kwargs) self.quantizer_mix_bits, self.aquantizer, ).fake_quant_act_dynamic(x_tmp) + out = inspect_module(x_tmp, **kwargs) if isinstance(out, tuple): out = out[0] - if self.padding_mask: - org_out = org_out * self.padding_mask[i].unsqueeze(dim=-1).to(org_out.device) # noqa - out = out * self.padding_mask[i].unsqueeze(dim=-1).to(out.device) - loss = (org_out - out).float().pow(2).mean().item() loss_mean += x.shape[0] * 1.0 / self.n_samples * loss scales_mean += x.shape[0] * 1.0 / self.n_samples * scales @@ -144,8 +136,6 @@ def search_scale_subset(self, layers_dict, input, inspect_module, subset_kwargs) best_error = loss_mean best_scales = scales_mean best_scales = best_scales.view(-1) - dist.all_reduce(best_scales, op=dist.ReduceOp.SUM) - best_scales /= int(os.environ['WORLD_SIZE']) del org_out_dict gc.collect() torch.cuda.empty_cache() @@ -166,11 +156,7 @@ def block_transform(self, block, input_feat, block_kwargs): if self.weight_clip: logger.info('auto_clip start') logger.info(f'clip version: {self.clip_version}') - self.auto_clip( - block, - input_feat, - n_sample_token=self.config.calib.get('seq_len', None) - ) + self.auto_clip(block, input_feat, n_sample_token=self.config.calib.seq_len) logger.info('auto_clip finished') else: logger.info('disable weight clip') diff --git a/llmc/compression/quantization/base_blockwise_quantization.py 
b/llmc/compression/quantization/base_blockwise_quantization.py index 09ad2b66a..242883491 100644 --- a/llmc/compression/quantization/base_blockwise_quantization.py +++ b/llmc/compression/quantization/base_blockwise_quantization.py @@ -1,12 +1,10 @@ import functools import gc import json -import os from collections import defaultdict from functools import partial import torch -import torch.distributed as dist import torch.nn as nn from loguru import logger @@ -15,16 +13,18 @@ from ..blockwise_optimization import BlockwiseOpt from .hadamard_utils import apply_exact_had_to_linear, get_hadK from .module_utils import (_LLMC_LINEAR_TYPES_, _LLMC_LN_TYPES_, - _REALQUANT_LINEAR_MAP_, _TRANSFORMERS_LINEAR_TYPES_, + _TRANSFORMERS_LINEAR_TYPES_, _TRANSFORMERS_LN_TYPES_, EffcientFakeQuantLinear, - FakeQuantLinear, OriginFloatLinear, RotateLinear) -from .quant import FloatQuantizer, IntegerQuantizer + FakeQuantLinear, OriginFloatLinear, RealQuantLinear, + RotateLinear) +from .quant import Quantizer +from .rotate_utils import ActRotater, WeightRotater from .utils import check_do_quant, check_w_only, get_aquantizer, get_wquantizer class BaseBlockwiseQuantization(BlockwiseOpt): - def __init__(self, model, quant_config, input, padding_mask, config): - super().__init__(model, quant_config, input, padding_mask, config) + def __init__(self, model, quant_config, input, config): + super().__init__(model, quant_config, input, config) self.set_quant_config() def w_qdq(self, module, wquantizer): @@ -45,7 +45,14 @@ def a_qdq(self, act, module, aquantizer): def logit(self, x): return torch.log(x / (1 - x)) - def get_replacement_params(self, mode='fake_quant', w_only=False, name=None): + def random_orthogonal_matrix(self, hidden_size, dev): + torch.cuda.empty_cache() + random_matrix = torch.randn(size, size, dtype=torch.float64).to(device) + q, r = torch.linalg.qr(random_matrix) + q *= torch.sign(torch.diag(r)).unsqueeze(0) + return q + + def get_replacement_params(self, mode='fake_quant', w_only=False, name=None, args={}): params_dict = {} if mode == 'fake_quant': if not self.mix_bits: @@ -65,37 +72,47 @@ def get_replacement_params(self, mode='fake_quant', w_only=False, name=None): params_dict['aquantizer_default'] = self.aquantizer params_dict['w_only_default'] = w_only - elif mode in _REALQUANT_LINEAR_MAP_.keys(): + elif mode == 'real_quant': params_dict['w_q'] = partial(self.w_q, wquantizer=self.wquantizer) params_dict['quant_config'] = self.quant_config - elif mode == 'online_rotate': + elif mode == 'rotate': + params_dict['w_rot'], params_dict['a_rot'] = None, None + if hasattr(self, 'weight_rotate') and self.weight_rotate: + params_dict['w_rot'] = partial(self.w_rot, w_rotater=self.w_rotater, args=args) + + if hasattr(self, 'online_rotate') and self.online_rotate: + if hasattr(self, 'weight_rotate') and self.weight_rotate: + if name is None or not 'down_proj' in name: + return params_dict + else: + if name is None or not ('down_proj' in name): + return params_dict + + had_K, K = get_hadK( + self.intermediate_size if 'down_proj' in name else self.num_heads + ) + a_rotater = ActRotater( + online_full_had=True if 'down_proj' in name else False, + online_partial_had=True if 'o_proj' in name else False, + fp32_had=self.fp32_had, + K=K, + had_K=had_K, + had_dim=None if 'down_proj' in name else self.hidden_size // self.num_heads, + ) + params_dict['a_rot'] = partial(self.a_rot, a_rotater=a_rotater) - had_K, K = get_hadK( - self.intermediate_size if 'down_proj' in name else self.num_heads - ) - params_dict = { - 
'had_K': had_K, - 'K': K, - 'online_full_had': 'down_proj' in name, - 'online_partial_had': 'o_proj' in name, - 'had_dim': ( - None if 'down_proj' in name else self.hidden_size // self.num_heads - ), - 'fp32_had': self.fp32_had, - } return params_dict def alloc_bits(self, mix_bits_settings): - for i in range(len(mix_bits_settings)): mix_bits_setting = mix_bits_settings[f'setting_{i}'] if mix_bits_setting['do_quant']: - wquantizer_mix_bits = self.quant_module(**mix_bits_setting['weight']) + wquantizer_mix_bits = Quantizer(**mix_bits_setting['weight']) if 'act' in mix_bits_setting: w_only_mix_bits = False - aquantizer_mix_bits = self.quant_module(**mix_bits_setting['act']) + aquantizer_mix_bits = Quantizer(**mix_bits_setting['act']) else: w_only_mix_bits = True self.quantizer_mix_bits.append( @@ -145,25 +162,14 @@ def set_quant_config(self): self.quantizer_mix_bits = [] self.quant_out = self.quant_config.get('quant_out', False) - self.tp = self.quant_config.get('tp', 1) - self.quant_config['weight']['tp'] = self.tp - - # select quant module - self.quant_type = self.quant_config.get('quant_type', 'int_quant') - if self.quant_type == 'int_quant': - self.quant_module = IntegerQuantizer - else: - self.quant_module = FloatQuantizer - logger.info(f'The used Quant Module is {self.quant_module}') # set weight quant config - self.wquantizer = self.quant_module(**self.quant_config['weight']) + self.wquantizer = Quantizer(**self.quant_config['weight']) # set act quant config if 'act' in self.quant_config: self.w_only = False - self.quant_config['act']['tp'] = self.tp - self.aquantizer = self.quant_module(**self.quant_config['act']) + self.aquantizer = Quantizer(**self.quant_config['act']) else: self.w_only = True self.aquantizer = None @@ -210,28 +216,11 @@ def set_quant_config(self): assert self.config['model']['type'] in ['Opt', 'Llama'] self.hidden_size = self.model.model_config.hidden_size - if self.online_rotate: - self.num_heads = self.model.model_config.num_attention_heads - self.head_dim = self.hidden_size // self.num_heads - self.intermediate_size = self.model.model_config.intermediate_size - self.fp32_had = special_config.get('fp32_had', False) - - def replace_rotate_linears(self, block): - for n, m in block.named_modules(): - if isinstance(m, nn.Linear) and ('down_proj' in n - or 'o_proj' in n - or 'fc2' in n - or 'out_proj' in n): - subset = {'layers': {n: m}} - self.model.replace_module_subset( - RotateLinear, - block, - subset, - None, - self.get_replacement_params( - mode='online_rotate', w_only=self.w_only, name=n - ), - ) + # if self.online_rotate: + self.num_heads = self.model.model_config.num_attention_heads + self.head_dim = self.hidden_size // self.num_heads + self.intermediate_size = self.model.model_config.intermediate_size + self.fp32_had = special_config.get('fp32_had', False) def block_forward(self, block, input_data=None): output = [] @@ -256,19 +245,15 @@ def block_forward(self, block, input_data=None): def block_opt(self, block): block = block.cuda() named_linears = self.model.get_block_linears(block) - extra_modules = self.model.get_extra_modules(block) - input_feat_modules = { - k: v for d in [named_linears, extra_modules] for k, v in d.items() - } - logger.info(f'input_feat_modules: {input_feat_modules}') + logger.info(f'named_linears: {named_linears}') input_feat = defaultdict(list) handles = [] self.block_init(block) if not self.data_free: - for name in input_feat_modules: + for name in named_linears: handles.append( - input_feat_modules[name].register_forward_hook( + 
named_linears[name].register_forward_hook( functools.partial( self.cache_input_hook, name=name, feat_dict=input_feat ) @@ -334,7 +319,7 @@ def block_init(self, block): def filter_subset(self, subset): return True - def collect_layers_weights(self, layers, tensor_parallelize_style=None): + def collect_layers_weights(self, layers): weights = [] for _m in layers: weights.append(_m.weight) @@ -379,7 +364,7 @@ def apply_shift(self, shifts, prev_op, layers): def scale_fc_fc(self, fc1, fc2, scales): scales = scales.to(fc1.weight.device) if fc1.out_features == fc2.in_features * 3: - num_heads = self.model.get_num_attention_heads() + num_heads = self.model.get_model_config().to_dict().get('n_head', None) fc1.weight.t_() org_shape = fc1.weight.shape fc1.weight.data = fc1.weight.data.reshape(org_shape[0] * num_heads, 3, -1) @@ -461,7 +446,7 @@ def scale_ln_fcs(self, ln, fcs, scales): scales = scales.to(ln.weight.device) ln.weight.div_(scales) - if hasattr(ln, 'bias') and ln.bias is not None: + if self.model.has_bias(): ln.bias.div_(scales) for fc in fcs: @@ -506,12 +491,6 @@ def auto_clip(self, block, input_feat, n_sample_token): n_sample_token=n_sample_token, ) - dist.all_reduce(max_val, op=dist.ReduceOp.SUM) - max_val /= int(os.environ['WORLD_SIZE']) - - dist.all_reduce(min_val, op=dist.ReduceOp.SUM) - min_val /= int(os.environ['WORLD_SIZE']) - self.apply_clip(m, min_val, max_val, n) @torch.no_grad() @@ -519,20 +498,12 @@ def apply_clip(self, layer, min_val, max_val, layer_name): if self.clip_version == 'v1': max_val = max_val.to(layer.weight.device) org_shape = layer.weight.shape - try: - layer.weight.data = layer.weight.data.reshape(*max_val.shape[:2], -1) - except RuntimeError: - layer.weight.data = self.wquantizer.reshape_tensor(layer.weight.data) - layer.weight.data = layer.weight.data.reshape(*max_val.shape[:2], -1) + layer.weight.data = layer.weight.data.reshape(*max_val.shape[:2], -1) if self.clip_sym: min_val = -max_val layer.weight.data = torch.clamp(layer.weight.data, min_val, max_val) - try: - layer.weight.data = layer.weight.data.reshape(org_shape) - except RuntimeError: - layer.weight.data = self.wquantizer \ - .restore_tensor(layer.weight.data, org_shape) + layer.weight.data = layer.weight.data.reshape(org_shape) elif self.clip_version == 'v2': up_factor, low_factor = self.get_clip_factor( layer, min_val, max_val, layer_name @@ -607,11 +578,7 @@ def auto_clip_layer( else: group_size = w.shape[1] - try: - w = w.reshape(w.shape[0], 1, -1, group_size) - except RuntimeError: - w = self.wquantizer.reshape_tensor(w) - w = w.reshape(w.shape[0], 1, -1, group_size) + w = w.reshape(w.shape[0], 1, -1, group_size) oc_batch_size = 256 if w.shape[0] % 256 == 0 else 64 # prevent OOM assert w.shape[0] % oc_batch_size == 0 @@ -620,7 +587,7 @@ def auto_clip_layer( best_min_val_all = [] for i_b in range(w.shape[0] // oc_batch_size): - w = w_all[i_b * oc_batch_size: (i_b + 1) * oc_batch_size] + w = w_all[i_b * oc_batch_size : (i_b + 1) * oc_batch_size] if self.clip_sym: org_max_val = w.abs().amax(dim=-1, keepdim=True) @@ -648,17 +615,8 @@ def auto_clip_layer( input[i] = input[i].to(w.device) x = input[i] x = x.view(-1, x.shape[-1]) - if self.padding_mask: - mask_tmp = self.padding_mask[i].flatten() - x = x[mask_tmp.bool()] - try: - x = x.reshape(1, x.shape[0], -1, group_size) - except RuntimeError: - x = self.wquantizer.reshape_tensor(x) - x = x.reshape(1, x.shape[0], -1, group_size) - if n_sample_token is None: - n_sample_token = min(x.shape[1], 512) - x = x[:, 0:: x.shape[1] // n_sample_token] + x = 
x.reshape(1, x.shape[0], -1, group_size) + x = x[:, 0 :: x.shape[1] // n_sample_token] if i in org_out_dict: org_out = org_out_dict[i] else: @@ -682,14 +640,14 @@ def auto_clip_layer( w, low_factor, up_factor ) - scales, zeros, qmax, qmin = wquantizer.get_qparams( + scales, zeros, max_int, min_int = wquantizer.get_qparams( tensor_range, w.device ) args = {} args['scales'] = scales args['zeros'] = zeros - args['qmax'] = qmax - args['qmin'] = qmin + args['max_int'] = max_int + args['min_int'] = min_int q_w = wquantizer.fake_quant_weight_static(w, args) else: raise Exception('Not support other clip version') @@ -740,41 +698,87 @@ def auto_clip_layer( torch.cuda.empty_cache() return best_max_val.squeeze(1), best_min_val.squeeze(1) + def replace_rotate_fc(self, block, n, m, Q1=None, Q2=None, transpose=False): + args = {} + if hasattr(self, 'weight_rotate') and self.weight_rotate: + args['Q1'] = Q1 + args['Q2'] = Q2 + args['transpose'] = transpose + + params_dict = self.get_replacement_params(mode='rotate', w_only=self.w_only, name=n, args=args) + if params_dict == {}: + return + + subset = {'layers': {n: m}} + self.model.replace_module_subset( + RotateLinear, + block, + subset, + self.block_idx, + params_dict + ) + + def replace_rotate_fcs(self, block): + for n, m in block.named_modules(): + if isinstance(m, nn.Linear): + self.replace_rotate_fc(block, n, m) + + def rotate_weight(self, weight, bias, Q, transpose): + dtype = weight.dtype + dev = weight.data.device + R_b = bias + + W = weight.data.to(device=dev, dtype=torch.float64) + Q = Q.to(device=dev, dtype=torch.float64) + if not transpose: + R_W = torch.matmul(W, Q).to(device='cpu', dtype=dtype) + else: + R_W = torch.matmul(Q.T, W).to(device='cpu', dtype=dtype) + if bias is not None: + b = bias.data.to(device=dev, dtype=torch.float64) + R_b = torch.matmul(Q.T, b).to(device='cpu', dtype=dtype) + + return R_W, R_b + def rotate_pre_layers(self, pre_layers, Q): + transpose = False for layer in pre_layers: - dtype = layer.weight.dtype - device = layer.weight.data.device - W = layer.weight.data.to(device=device, dtype=torch.float64) - layer.weight.data = torch.matmul(W, Q).to(device='cpu', dtype=dtype) + layer.weight.data, _ = self.rotate_weight(layer.weight, None, Q, transpose) def rotate_post_layers(self, post_layers, Q, exact_had=False): + transpose = True for layer in post_layers: - dtype = layer.weight.dtype - device = layer.weight.data.device - W = layer.weight.data.to(device=device, dtype=torch.float64) - layer.weight.data = torch.matmul(Q.T, W).to(device='cpu', dtype=dtype) + weight = layer.weight + if hasattr(layer, 'bias') and layer.bias is not None: + bias = layer.bias + else: + bias = None + R_weight, R_bias = self.rotate_weight( + weight, bias, Q, transpose + ) + layer.weight.data = R_weight + if bias is not None: + layer.bias.data = bias if exact_had and self.online_rotate: apply_exact_had_to_linear(layer, had_dim=-1, output=False) - if hasattr(layer, 'bias') and layer.bias is not None: - b = layer.bias.data.to(device=device, dtype=torch.float64) - layer.bias.data = torch.matmul(Q.T, b).to(device='cpu', dtype=dtype) - def rotate_embeddings(self, Q): + transpose = False embeddings = self.model.get_embed_layers() assert len(embeddings) == 1 for layer in embeddings: - dtype = layer.weight.data.dtype - W = layer.weight.data.to(device=self.dev, dtype=torch.float64) - layer.weight.data = torch.matmul(W, Q).to(device='cpu', dtype=dtype) + layer.weight.data, _ = self.rotate_weight( + layer.weight, None, Q, transpose + ) def rotate_head(self, 
Q): + transpose = False heads = self.model.get_head_layers() for layer in heads: - dtype = layer.weight.data.dtype - W = layer.weight.data.to(device=self.dev, dtype=torch.float64) - layer.weight.data = torch.matmul(W, Q).to(device='cpu', dtype=dtype) + layer.weight.data, _ = self.rotate_weight( + layer.weight, None, Q, transpose + ) def fuse_ln_fcs(self, ln, fcs): for fc in fcs: @@ -786,8 +790,7 @@ def fuse_ln_fcs(self, ln, fcs): fc.bias = torch.nn.Parameter( torch.zeros(fc.out_features, dtype=torch.float64) ) - fc.bias.data = fc.bias.data.double().to(device=W.device) \ - + torch.matmul(W, ln.bias.double()) + fc.bias.data = fc.bias.data.double() + torch.matmul(W, ln.bias.double()) fc.bias.data = fc.bias.data.to(fc_dtype) def remove_mean_from_embed(self): @@ -809,15 +812,15 @@ def bake_mean_into_fc(self, fc): fc.bias.data = fc.bias.data.to(fc_dtype) @torch.no_grad() - def deploy(self, quant_format, keep_device=False): + def deploy(self, quant_format): logger.info(f'-- deploy_{quant_format}_model start --') logger.info(f'quant_config : {self.quant_config}') module_mapping = { + 'fake_quant': EffcientFakeQuantLinear, + 'real_quant': RealQuantLinear, 'origin_float': OriginFloatLinear, - 'fake_quant': EffcientFakeQuantLinear } - module_mapping.update(_REALQUANT_LINEAR_MAP_) if quant_format not in module_mapping: raise NotImplementedError( @@ -826,17 +829,14 @@ def deploy(self, quant_format, keep_device=False): module = module_mapping[quant_format] self.model.replace_module_all( - module, - self.get_replacement_params(mode=quant_format, w_only=self.w_only), - keep_device=keep_device + module, self.get_replacement_params(mode=quant_format, w_only=self.w_only) ) logger.info(f'-- deploy_{quant_format}_model done --') @torch.no_grad() def copy_tokenizer(self, path): - for substring in self.config.save.get('tokenizer_file_substring', - ['token', 'merges', 'vocab']): + for substring in self.config.save.get('tokenizer_file_substring', ['token']): copy_files(self.config.model.path, path, substring) logger.info('copy tokenizer done --') @@ -852,18 +852,11 @@ def contiguous_params(self): @torch.no_grad() def save_model(self, path): - if int(os.environ['RANK']) != 0: - return - self.contiguous_params() - if self.config.model.type in ['Llava', 'InternVL2']: - self.model.vlm_model.language_model = self.model.get_model() - self.model.vlm_model.save_pretrained(path) - logger.info('save model done --') - self.copy_tokenizer(path) - copy_files(self.config.model.path, path, 'preprocessor_config') - elif self.config.model.type in ['InternOmni']: - self.model.avlm_model.language_model = self.model.get_model() - self.model.avlm_model.save_pretrained(path) + if self.online_rotate: + self.contiguous_params() + if self.config.model.type == 'Llava': + self.model.llava_model.language_model = self.model.get_model() + self.model.llava_model.save_pretrained(path) logger.info('save model done --') self.copy_tokenizer(path) copy_files(self.config.model.path, path, 'preprocessor_config') diff --git a/llmc/compression/quantization/dgq.py b/llmc/compression/quantization/dgq.py index 4109065d5..823ba4862 100644 --- a/llmc/compression/quantization/dgq.py +++ b/llmc/compression/quantization/dgq.py @@ -8,13 +8,13 @@ from .base_blockwise_quantization import BaseBlockwiseQuantization from .module_utils import _LLMC_LN_TYPES_, _TRANSFORMERS_LN_TYPES_ -from .quant import IntegerQuantizer +from .quant import Quantizer @ALGO_REGISTRY class DGQ(BaseBlockwiseQuantization): - def __init__(self, model, quant_config, input, padding_mask, 
config): - super().__init__(model, quant_config, input, padding_mask, config) + def __init__(self, model, quant_config, input, config): + super().__init__(model, quant_config, input, config) self.model_dtype = next(self.model.model.parameters()).dtype def w_qdq(self, module, wquantizer): @@ -28,8 +28,8 @@ def w_qdq(self, module, wquantizer): args = {} args['scales'] = s.reshape(-1, 1) args['zeros'] = zeros.reshape(-1, 1) - args['qmax'] = upper - args['qmin'] = lower + args['max_int'] = upper + args['min_int'] = lower # logger.info(f"s.shape : {s.shape}") # logger.info(f"scales.shape : {scales.shape}") # logger.info(f"zeros.shape : {zeros.shape}") @@ -43,22 +43,21 @@ def set_quant_config(self): self.quant_out = True else: self.quant_out = False - self.quant_type = self.quant_config.get('quant_type', 'int_quant') - assert self.quant_type != 'float_quant', 'DGQ do not support Float quant now.' + # set weight quant config - self.wquantizer_w4 = IntegerQuantizer(**self.quant_config['weight']['w_1']) + self.wquantizer_w4 = Quantizer(**self.quant_config['weight']['w_1']) perchannel_setting = { 'bit': self.quant_config['weight']['w_1']['bit'], 'symmetric': self.quant_config['weight']['w_1']['symmetric'], 'granularity': 'per_channel', } - self.wquantizer_w4_perchannel = IntegerQuantizer(**perchannel_setting) - self.wquantizer_w8 = IntegerQuantizer(**self.quant_config['weight']['w_2']) + self.wquantizer_w4_perchannel = Quantizer(**perchannel_setting) + self.wquantizer_w8 = Quantizer(**self.quant_config['weight']['w_2']) # set act quant config if 'act' in self.quant_config and self.quant_config['act'] is not None: self.w_only = False - self.aquantizer = IntegerQuantizer(**self.quant_config['act']) + self.aquantizer = Quantizer(**self.quant_config['act']) else: self.w_only = True @@ -191,12 +190,12 @@ def search_scale_zero_layer(self, layer, input_feat): _, scales, zeros, - qmax, - qmin, + max_int, + min_int, ) = self.wquantizer_w4_perchannel.get_tensor_qparams(weight_OxG) # Perchannel do not need reshape and restore tensor. 
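            # get_tensor_qparams above yields one scale/zero per output row of
            # weight_OxG, so quant_dequant below can consume the tensor as-is,
            # with the per-channel parameters broadcasting over the group dim.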
weight_OxG_fq = self.wquantizer_w4_perchannel.quant_dequant( - weight_OxG, scales, zeros, qmax, qmin + weight_OxG, scales, zeros, max_int, min_int ) if not self.w_only: inp_LxG_fq = self.a_qdq(inp_LxG) @@ -225,8 +224,8 @@ def search_scale_zero_layer(self, layer, input_feat): _, qscales_8, zeros, - qmax, - qmin, + max_int, + min_int, ) = self.wquantizer_w8.get_tensor_qparams( weight_tmp.clamp(-w_max * ratio, w_max * ratio) ) diff --git a/llmc/compression/quantization/gptq.py b/llmc/compression/quantization/gptq.py index c8b0a9aea..f2f4319bd 100644 --- a/llmc/compression/quantization/gptq.py +++ b/llmc/compression/quantization/gptq.py @@ -17,8 +17,8 @@ @ALGO_REGISTRY class GPTQ(BaseBlockwiseQuantization): - def __init__(self, model, quant_config, input, padding_mask, config): - super().__init__(model, quant_config, input, padding_mask, config) + def __init__(self, model, quant_config, input, config): + super().__init__(model, quant_config, input, config) self.dev = torch.device('cuda') self.model_dtype = next(self.model.model.parameters()).dtype self.add_quant_config() @@ -64,7 +64,7 @@ def hessian_sorting(self, name): if self.actorder: perm = torch.cat( - [descending_ids[self.n_out:], descending_ids[:self.self.n_out]] + [descending_ids[self.n_out:], descending_ids[: self.self.n_out]] ) else: perm = torch.cat( @@ -98,20 +98,19 @@ def block_transform_true_sequential(self, block, input_feat): torch.cuda.empty_cache() self.subset_transform(subset['layers']) - if self.quant_out: - self.model.replace_module_subset( - FakeQuantLinear, - block, - subset, - self.block_idx, - self.get_replacement_params('fake_quant', w_only=True), - ) + self.model.replace_module_subset( + FakeQuantLinear, + block, + subset, + self.block_idx, + self.get_replacement_params('fake_quant', w_only=True), + ) @torch.no_grad() def block_transform(self, block, input_feat, block_kwargs): logger.info(f'Start transform the {self.block_idx+1}-th block') if self.online_rotate: - self.replace_rotate_linears(block) + self.replace_rotate_fcs(block) if self.owq and not hasattr(self, 'n_out_dict'): named_linears = self.model.get_block_linears(block) self.n_out_dict = {} @@ -149,7 +148,7 @@ def initialize_qparams_and_prepare_weights(self, layer, name): self.qparams = {} self.columns = self.layers_cache[name]['columns'] self.n_out = self.n_out_dict[name] if self.owq else 0 - self.n_nonout = self.columns - self.n_out + self.n_nonout = layer.weight.data.shape[1] - self.n_out if self.actorder or self.owq: self.hessian_sorting(name) @@ -238,18 +237,14 @@ def weight_transform(self, W, Hinv, Losses, tmp): for i in range(count): w, d = W1[:, i], Hinv1[i, i] + idx = i1 + i + if self.wquantizer.granularity == 'per_group': - idx = i1 + i - if not self.static_groups: - if (i1 + i) % self.wquantizer.group_size == 0: - column_tensors = W[ - :, - (i1 + i):min( - (i1 + i + self.wquantizer.group_size), - (self.columns - self.n_out), - ), - ] - self.search_column_qparams(column_tensors, idx) + if not self.static_groups and idx % self.wquantizer.group_size == 0: + col_end = min( + idx + self.wquantizer.group_size, self.columns - self.n_out + ) + self.search_column_qparams(W[:, idx:col_end], idx) else: if self.actorder: idx = self.perm[idx] @@ -259,8 +254,8 @@ def weight_transform(self, W, Hinv, Losses, tmp): w.unsqueeze(1), self.qparams['scale'], self.qparams['zero'], - self.qparams['qmax'], - self.qparams['qmin'], + self.qparams['max_int'], + self.qparams['min_int'], ).squeeze(1) tmp1[:, i] = w @@ -286,7 +281,7 @@ def add_batch(self, layer, name, inp, 
out): ): if isinstance(layer, RotateLinear): # online rotate - inp = layer.rotater.rotate(inp) + inp = layer.a_rotater.rotate(inp) if len(inp.shape) == 3: inp = inp.reshape((-1, inp.shape[-1])) inp = inp.t() @@ -346,21 +341,21 @@ def collect_model_qparams(self): tensor, scales, zeros, - qmax, - qmin, + max_int, + min_int, ) = self.wquantizer.get_tensor_qparams(m.weight.data) m = m.to(self.model_dtype) m.cpu() m.register_buffer('buf_scales', scales) m.register_buffer('buf_zeros', zeros) - m.register_buffer('buf_qmax', torch.tensor(qmax)) - m.register_buffer('buf_qmin', torch.tensor(qmin)) + m.register_buffer('buf_max_int', torch.tensor(max_int)) + m.register_buffer('buf_min_int', torch.tensor(min_int)) @torch.no_grad() def split_qparams(self, qparams): group_qparams = [] - group_num = math.ceil(self.columns / self.wquantizer.group_size) - qparams = qparams.reshape(math.ceil(qparams.shape[0] / group_num), -1) + group_num = self.columns // self.wquantizer.group_size + qparams = qparams.reshape(qparams.shape[0] // group_num, -1) qparams = qparams.t() group_qparams = list(torch.split(qparams, 1, dim=0)) for i in range(len(group_qparams)): @@ -384,11 +379,11 @@ def merge_qparams(self, qparams): @torch.no_grad() def search_column_qparams(self, c_tensor, idx): - _, scale, zero, qmax, qmin = self.wquantizer.get_tensor_qparams(c_tensor) + _, scale, zero, max_int, min_int = self.wquantizer.get_tensor_qparams(c_tensor) self.qparams['scale'] = scale self.qparams['zero'] = zero - self.qparams['qmax'] = qmax - self.qparams['qmin'] = qmin + self.qparams['max_int'] = max_int + self.qparams['min_int'] = min_int qparams = copy.deepcopy(self.qparams) self.groups[idx // self.wquantizer.group_size] = qparams @@ -397,28 +392,27 @@ def search_layer_qparams(self, layer): scales = layer.buf_scales zeros = layer.buf_zeros scales = self.merge_qparams(scales) - if not self.wquantizer.sym: - zeros = self.merge_qparams(zeros) + zeros = self.merge_qparams(zeros) self.qparams['scale'], self.qparams['zero'] = scales, zeros - self.qparams['qmax'] = layer.buf_qmax - self.qparams['qmin'] = layer.buf_qmin + self.qparams['max_int'] = layer.buf_max_int + self.qparams['min_int'] = layer.buf_min_int @torch.no_grad() def search_group_qparams(self, layer): scales = layer.buf_scales zeros = layer.buf_zeros self.group_scales = self.split_qparams(scales) - if not self.wquantizer.sym: + if zeros is not None: self.group_zeros = self.split_qparams(zeros) for i in range(len(self.group_scales)): qparams = {} qparams['scale'] = self.group_scales[i] - if not self.wquantizer.sym: + if zeros is not None: qparams['zero'] = self.group_zeros[i] else: - qparams['zero'] = torch.tensor(0.0) - qparams['qmax'] = layer.buf_qmax - qparams['qmin'] = layer.buf_qmin + qparams['zero'] = None + qparams['max_int'] = layer.buf_max_int + qparams['min_int'] = layer.buf_min_int self.groups.append(qparams) @torch.no_grad() @@ -429,11 +423,9 @@ def update_model_qparams(self, layer): _scales.append(g['scale']) _zeros.append(g['zero']) scales = self.merge_qparams(_scales) + zeros = self.merge_qparams(_zeros) layer.buf_scales = copy.deepcopy(scales) - - if not self.wquantizer.sym: - zeros = self.merge_qparams(_zeros) - layer.buf_zeros = copy.deepcopy(zeros) + layer.buf_zeros = copy.deepcopy(zeros) @torch.no_grad() def w_q(self, module, wquantizer): @@ -441,8 +433,8 @@ def w_q(self, module, wquantizer): args = {} args['scales'] = module.buf_scales args['zeros'] = module.buf_zeros - args['qmax'] = module.buf_qmax - args['qmin'] = module.buf_qmin + args['max_int'] = 
module.buf_max_int + args['min_int'] = module.buf_min_int args['scales'] = args['scales'].to(self.model_dtype) weight, scales, zeros = wquantizer.real_quant_weight_static(weight, args) @@ -461,8 +453,8 @@ def w_qdq(self, module, wquantizer): args['zeros'] = module.buf_zeros else: args['zeros'] = None - args['qmax'] = module.buf_qmax - args['qmin'] = module.buf_qmin + args['max_int'] = module.buf_max_int + args['min_int'] = module.buf_min_int if self.owq: fp_weight = weight[:, module.buf_n_nonout:] @@ -480,7 +472,7 @@ def w_qdq(self, module, wquantizer): @torch.no_grad() def deploy(self, quant_format): - if quant_format not in ['fake_quant', 'origin_float']: + if quant_format == 'real_quant': assert not self.need_perm super().deploy(quant_format) self.model.convert_dtype(self.model_dtype) diff --git a/llmc/compression/quantization/hadamard_utils.py b/llmc/compression/quantization/hadamard_utils.py index 2a5f4b144..81c1e15d5 100644 --- a/llmc/compression/quantization/hadamard_utils.py +++ b/llmc/compression/quantization/hadamard_utils.py @@ -11,11 +11,23 @@ 'If you need it, please install it firstly.' ) -# from .module_utils import RotateLinear # Adapted from # https://github.com/Cornell-RelaxML/quip-sharp/blob/main/lib/utils/matmul_had.py +class HadamardTransform(torch.autograd.Function): + """The unnormalized Hadamard transform (i.e. without dividing by + sqrt(2))""" + + @staticmethod + def forward(ctx, u): + return fast_hadamard_transform.hadamard_transform(u) + + @staticmethod + def backward(ctx, grad): + return fast_hadamard_transform.hadamard_transform(grad) + + def get_hadK(n, transpose=False): hadK, K = None, None if n % 172 == 0: # llama-2-7b up @@ -109,25 +121,68 @@ def random_hadamard_matrix(size, device): def matmul_hadU_cuda(X, hadK, K): n = X.shape[-1] if K == 1: - return fast_hadamard_transform.hadamard_transform( - X.contiguous(), 1.0 / torch.tensor(n).sqrt() - ) + return HadamardTransform.apply(X.contiguous()) / torch.tensor(n).sqrt() # if transpose: # hadK = hadK.T.contiguous() input = X.view(-1, K, n // K) - input = fast_hadamard_transform.hadamard_transform( - input.contiguous(), 1.0 / torch.tensor(n).sqrt() - ) + input = HadamardTransform.apply(input.contiguous()) / torch.tensor(n).sqrt() input = hadK.to(input.device).to(input.dtype) @ input return input.reshape(X.shape) - def matmul_hadUt_cuda(X, hadK, K): return matmul_hadU_cuda(X, hadK, K, transpose=True) -def apply_exact_had_to_linear(module, had_dim=-1, output=False): - # assert isinstance(module, (torch.nn.Linear, RotateLinear)) +# def apply_exact_had_to_linear(module, had_dim=-1, output=False, R2=None): +# # assert isinstance(module, (torch.nn.Linear, RotateLinear)) +# in_features, out_features = module.in_features, module.out_features + +# if had_dim != -1: +# assert is_pow2(had_dim), 'Hadamard dimension must be a power of 2!' 
+ +# W_ = module.weight.data +# dtype = W_.dtype +# dev = W_.device +# # init_shape = W_.shape +# W_ = W_.float().cuda() + +# if had_dim == -1: +# if output: +# had_K, K = get_hadK(out_features) +# W_ = matmul_hadU_cuda(W_.t(), had_K, K).t() +# if not output: +# had_K, K = get_hadK(in_features) +# W_ = matmul_hadU_cuda(W_, had_K, K) +# else: +# # Apply Hadamard to the last had_dim chunks of the weights +# if output: +# W_ = W_.t() +# transposed_shape = W_.shape +# W_ = ( +# fast_hadamard_transform.hadamard_transform( +# W_.reshape(-1, transposed_shape[-1] // had_dim, had_dim), +# scale=1 / math.sqrt(had_dim), +# ) +# .reshape(transposed_shape) +# .t() +# ) +# else: +# raise NotImplementedError('Not implemented (or tested) yet!') +# # n = W_.shape[1] +# # W_ = hadamard_transform( +# # W_.reshape(-1, n // had_dim, had_dim), scale=1 / math.sqrt(had_dim) +# # ).reshape(init_shape) +# module.weight.data = W_.to(device=dev, dtype=dtype) + + +def hadamard_matrix(size, device): + # See https://cornell-relaxml.github.io/quip-sharp/ , Section "Randomized Hadamard Transformation" + Q = torch.eye(size) + return matmul_hadU(Q).to(device) + + +def apply_exact_had_to_linear(module, had_dim=-1, output=False, R2=None): + # assert isinstance(module, torch.nn.Linear) in_features, out_features = module.in_features, module.out_features if had_dim != -1: @@ -136,7 +191,7 @@ def apply_exact_had_to_linear(module, had_dim=-1, output=False): W_ = module.weight.data dtype = W_.dtype dev = W_.device - # init_shape = W_.shape + init_shape = W_.shape W_ = W_.float().cuda() if had_dim == -1: @@ -147,27 +202,22 @@ def apply_exact_had_to_linear(module, had_dim=-1, output=False): had_K, K = get_hadK(in_features) W_ = matmul_hadU_cuda(W_, had_K, K) else: - # Apply Hadamard to the last had_dim chunks of the weights + hadK = hadamard_matrix(had_dim, 'cuda').to(torch.float64) + if R2 is not None: + hadK = R2.to(torch.float64) if output: W_ = W_.t() transposed_shape = W_.shape - W_ = ( - fast_hadamard_transform.hadamard_transform( - W_.reshape(-1, transposed_shape[-1] // had_dim, had_dim), - scale=1 / math.sqrt(had_dim), - ) - .reshape(transposed_shape) - .t() - ) + temp = W_.reshape(-1, transposed_shape[-1] // had_dim, had_dim) + temp = temp.to(torch.float64) @ hadK + W_ = temp.reshape(transposed_shape).t() else: - raise NotImplementedError('Not implemented (or tested) yet!') - # n = W_.shape[1] - # W_ = hadamard_transform( - # W_.reshape(-1, n // had_dim, had_dim), scale=1 / math.sqrt(had_dim) - # ).reshape(init_shape) + init_shape = W_.shape + temp = W_.reshape(-1, init_shape[-1] // had_dim, had_dim) + temp = temp.to(torch.float64) @ hadK + W_ = temp.reshape(init_shape) module.weight.data = W_.to(device=dev, dtype=dtype) - def is_pow2(n): return (n & (n - 1) == 0) and (n > 0) diff --git a/llmc/compression/quantization/hqq.py b/llmc/compression/quantization/hqq.py index 0077c401b..0784dfc47 100644 --- a/llmc/compression/quantization/hqq.py +++ b/llmc/compression/quantization/hqq.py @@ -11,8 +11,8 @@ @ALGO_REGISTRY class HQQ(BaseBlockwiseQuantization): - def __init__(self, model, quant_config, input, padding_mask, config): - super().__init__(model, quant_config, input, padding_mask, config) + def __init__(self, model, quant_config, input, config): + super().__init__(model, quant_config, input, config) self.add_quant_config() @torch.no_grad() @@ -34,13 +34,13 @@ def add_quant_config(self): ) @torch.no_grad() - def optimize_weights_proximal(self, W_f, scales, zeros, qmax, qmin): + def optimize_weights_proximal(self, W_f, scales, 
zeros, max_int, min_int): best_error = 1e4 current_beta = self.beta current_kappa = self.kappa scales = 1 / scales for i in range(self.iters): - W_q = torch.round(W_f * scales + zeros).clamp(qmin, qmax) + W_q = torch.round(W_f * scales + zeros).clamp(min_int, max_int) W_r = (W_q - zeros) / scales W_e = self.shrink_op(W_f - W_r, current_beta) @@ -77,17 +77,17 @@ def block_opt(self, block): tensor, org_scales, org_zeros, - qmax, - qmin, + max_int, + min_int, ) = self.wquantizer.get_tensor_qparams(tensor) best_scales, best_zeros = self.optimize_weights_proximal( - tensor, org_scales, org_zeros, qmax, qmin + tensor, org_scales, org_zeros, max_int, min_int ) layer.register_buffer('buf_scales', best_scales) layer.register_buffer('buf_zeros', best_zeros) - layer.register_buffer('buf_qmax', torch.tensor(qmax)) - layer.register_buffer('buf_qmin', torch.tensor(qmin)) + layer.register_buffer('buf_max_int', torch.tensor(max_int)) + layer.register_buffer('buf_min_int', torch.tensor(min_int)) block = block.cpu() gc.collect() @@ -99,7 +99,7 @@ def w_qdq(self, module, wquantizer): args['dim'] = 'ic' args['scales'] = module.buf_scales args['zeros'] = module.buf_zeros - args['qmax'] = module.buf_qmax - args['qmin'] = module.buf_qmin + args['max_int'] = module.buf_max_int + args['min_int'] = module.buf_min_int return wquantizer.fake_quant_weight_static(module.weight, args) diff --git a/llmc/compression/quantization/llmint8.py b/llmc/compression/quantization/llmint8.py index 29209f63a..18b6fb9a9 100644 --- a/llmc/compression/quantization/llmint8.py +++ b/llmc/compression/quantization/llmint8.py @@ -9,8 +9,8 @@ @ALGO_REGISTRY class LlmInt8(BaseBlockwiseQuantization): - def __init__(self, model, quant_config, input, padding_mask, config): - super().__init__(model, quant_config, input, padding_mask, config) + def __init__(self, model, quant_config, input, config): + super().__init__(model, quant_config, input, config) self.add_quant_config() @torch.no_grad() diff --git a/llmc/compression/quantization/module_utils.py b/llmc/compression/quantization/module_utils.py index 1ce25676a..efe9ff843 100644 --- a/llmc/compression/quantization/module_utils.py +++ b/llmc/compression/quantization/module_utils.py @@ -1,31 +1,24 @@ +import gc import math from functools import partial -import numpy as np import torch import torch.nn as nn +import torch.nn.functional as F from loguru import logger +from transformers.models.llama.modeling_llama import LlamaRMSNorm +from transformers.models.mistral.modeling_mistral import MistralRMSNorm +from transformers.models.mixtral.modeling_mixtral import MixtralRMSNorm +from transformers.models.qwen2.modeling_qwen2 import Qwen2RMSNorm from transformers.pytorch_utils import ALL_LAYERNORM_LAYERS -try: - import fast_hadamard_transform - - from .hadamard_utils import matmul_hadU_cuda -except Exception: - logger.info( - 'fast_hadamard_transform not installed. ' - 'If you need it, please install it firstly.' 
- ) - -from .utils import calculate_zeros_width - class LlmcLayerNorm(nn.Module): def __init__(self, weight, bias, eps, normalized_shape, elementwise_affine): super().__init__() - self.register_buffer('weight', weight) + self.register_buffer("weight", weight) if bias is not None: - self.register_buffer('bias', bias) + self.register_buffer("bias", bias) else: self.bias = None self.eps = eps @@ -62,16 +55,16 @@ def new(cls, module): def __repr__(self): return ( - f'LlmcLayerNorm({self.normalized_shape},' - f'eps={self.eps},' - f'elementwise_affine={self.elementwise_affine})' + f"LlmcLayerNorm({self.normalized_shape}," + f"eps={self.eps}," + f"elementwise_affine={self.elementwise_affine})" ) class LlmcLlamaRMSNorm(nn.Module): def __init__(self, weight, eps=1e-6): super().__init__() - self.register_buffer('weight', weight) + self.register_buffer("weight", weight) self.bias = None self.variance_epsilon = eps self.use_tmp_parameter = False @@ -82,10 +75,10 @@ def forward(self, hidden_states): hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) if self.use_tmp_parameter: weight = self.tmp_weight - bias = self.tmp_bias if hasattr(self, 'tmp_bias') else None + bias = self.tmp_bias if hasattr(self, "tmp_bias") else None else: weight = self.weight - bias = self.bias if hasattr(self, 'bias') else None + bias = self.bias if hasattr(self, "bias") else None return ( (weight * hidden_states + bias).to(input_dtype) @@ -102,7 +95,7 @@ def new(cls, module): return new_module def __repr__(self): - return 'LlmcLlamaRMSNorm()' + return "LlmcLlamaRMSNorm()" class LlmcRMSNorm(nn.Module): @@ -120,16 +113,13 @@ def forward(self, hidden_states): @classmethod @torch.no_grad() def new(cls, module): - if hasattr(module, 'eps'): - eps = module.eps - else: - eps = module.variance_epsilon + eps = module.variance_epsilon weight = module.weight new_module = cls(weight, eps) return new_module def __repr__(self): - return 'LlmcRMSNorm()' + return "LlmcRMSNorm()" class LlmcQwen2RMSNorm(LlmcLlamaRMSNorm): @@ -137,7 +127,7 @@ def __init__(self, weight, eps=1e-6): super().__init__(weight, eps) def __repr__(self): - return 'LlmcQwen2RMSNorm()' + return "LlmcQwen2RMSNorm()" class LlmcMixtralRMSNorm(LlmcLlamaRMSNorm): @@ -145,7 +135,7 @@ def __init__(self, weight, eps=1e-6): super().__init__(weight, eps) def __repr__(self): - return 'LlmcMixtralRMSNorm()' + return "LlmcMixtralRMSNorm()" class LlmcMistralRMSNorm(LlmcLlamaRMSNorm): @@ -153,7 +143,7 @@ def __init__(self, weight, eps=1e-6): super().__init__(weight, eps) def __repr__(self): - return 'LlmcMistralRMSNorm()' + return "LlmcMistralRMSNorm()" class LlmcInternLM2RMSNorm(LlmcLlamaRMSNorm): @@ -161,44 +151,74 @@ def __init__(self, weight, eps=1e-6): super().__init__(weight, eps) def __repr__(self): - return 'LlmcInternLM2RMSNorm()' - - -class LlmcGemma2RMSNorm(LlmcLlamaRMSNorm): - def __init__(self, weight, eps=1e-6): - super().__init__(weight, eps) - - def __repr__(self): - return 'LlmcGemma2RMSNorm()' + return "LlmcInternLM2RMSNorm()" + + +class OriginEmbedding(nn.Module): + def __init__(self, num_embeddings, embedding_dim, padding_idx, + max_norm, norm_type, scale_grad_by_freq, + sparse, weight): + super(OriginEmbedding, self).__init__() + self.num_embeddings = num_embeddings + self.embedding_dim = embedding_dim + self.padding_idx = padding_idx + self.max_norm = max_norm + self.norm_type = norm_type + self.scale_grad_by_freq = scale_grad_by_freq + self.sparse = sparse + self.weight = weight + + def forward(self, input): + return F.embedding( + input, 
self.weight, self.padding_idx, self.max_norm, + self.norm_type, self.scale_grad_by_freq, self.sparse) + + @classmethod + @torch.no_grad() + def new(cls, module): + num_embeddings = module.num_embeddings + embedding_dim = module.embedding_dim + padding_idx = module.padding_idx + max_norm = module.max_norm + norm_type = module.norm_type + scale_grad_by_freq = module.scale_grad_by_freq + sparse = module.sparse + weight = module.weight -class LlmcMiniCPMRMSNorm(LlmcLlamaRMSNorm): - def __init__(self, weight, eps=1e-6): - super().__init__(weight, eps) + new_module = cls(num_embeddings, embedding_dim, padding_idx, + max_norm, norm_type, scale_grad_by_freq, + sparse, weight) + return new_module def __repr__(self): - return 'LlmcMiniCPMRMSNorm()' + return ( + f"OriginEmbedding({self.num_embeddings}, " + f"{self.embedding_dim}, " + f"padding_idx={self.padding_idx})," + ) class OriginFloatLinear(nn.Module): def __init__(self, weight, bias, ori_module): super().__init__() - self.register_buffer('weight', weight) + self.register_buffer("weight", weight) if bias is not None: - self.register_buffer('bias', bias) + self.register_buffer("bias", bias) else: self.bias = None for name, buf in ori_module.named_buffers(): - if name.startswith('buf_'): + if name.startswith("buf_"): self.register_buffer(name, buf.data) - if hasattr(self, 'buf_rotate') and self.buf_rotate: - self.rotater = ori_module.rotater + + if getattr(self, "buf_a_rotate", False): + self.a_rot = ori_module.a_rot @torch.no_grad() def forward(self, x): - if hasattr(self, 'buf_rotate') and self.buf_rotate: - x = self.rotater.rotate(x) + if hasattr(self, "a_rot"): + x = self.a_rot(x, self) x = torch.functional.F.linear(x, self.weight, self.bias) return x @@ -222,98 +242,112 @@ def new(cls, module): def __repr__(self): return ( - f'OriginFloatLinear(in_features={self.in_features},' - f'out_features={self.out_features},' - f'bias={self.bias is not None})' + f"OriginFloatLinear(in_features={self.in_features}, " + f"out_features={self.out_features}, " + f"buf_a_rotate={self.buf_a_rotate}, " + f"bias={self.bias is not None})" ) -class Rotater: - def __init__( - self, online_full_had, online_partial_had, fp32_had, K, had_K=None, had_dim=None - ): - self.online_full_had = online_full_had - self.online_partial_had = online_partial_had - self.fp32_had = fp32_had - self.K = K - self.had_K = had_K - self.had_dim = had_dim - - def rotate(self, x): - x_dtype = x.dtype - - if self.online_full_had: - if self.fp32_had: - x = matmul_hadU_cuda(x.float(), self.had_K, self.K).to(x_dtype) - else: - x = matmul_hadU_cuda(x, self.had_K, self.K) - - elif self.online_partial_had: - if self.fp32_had: - x = x.float() - init_shape = x.shape - if self.K == 1: - x = fast_hadamard_transform.hadamard_transform( - x.reshape( - -1, init_shape[-1] // self.had_dim, self.had_dim - ).transpose(1, 2), - scale=1 / math.sqrt(init_shape[-1] // self.had_dim), - ).transpose(1, 2) - else: - self.had_K = self.had_K.to(x.device) - - x = ( - self.had_K.to(x.dtype) - @ x.reshape(-1, init_shape[-1] // self.had_dim, self.had_dim) - ) / math.sqrt(init_shape[-1] // self.had_dim) - - if self.fp32_had: - x = x.to(x_dtype) - x = x.reshape(init_shape) +class RotateEmbedding(nn.Module): + def __init__(self, num_embeddings, embedding_dim, padding_idx, + max_norm, norm_type, scale_grad_by_freq, + sparse, weight, w_rot): + super(RotateEmbedding, self).__init__() + self.num_embeddings = num_embeddings + self.embedding_dim = embedding_dim + self.padding_idx = padding_idx + self.max_norm = max_norm + 
self.norm_type = norm_type + self.scale_grad_by_freq = scale_grad_by_freq + self.sparse = sparse + self.weight = weight + self.bias = None + self.w_rot = w_rot + + def forward(self, input): + + tmp_weight = self._rotate_weight() + + return F.embedding( + input, tmp_weight, self.padding_idx, self.max_norm, + self.norm_type, self.scale_grad_by_freq, self.sparse) + + def _rotate_weight(self): + if self.w_rot is not None: + tmp_weight, _ = self.w_rot(self) + else: + tmp_weight = self.weight + return tmp_weight + + @classmethod + @torch.no_grad() + def new(cls, module, w_rot): + + num_embeddings = module.num_embeddings + embedding_dim = module.embedding_dim + padding_idx = module.padding_idx + max_norm = module.max_norm + norm_type = module.norm_type + scale_grad_by_freq = module.scale_grad_by_freq + sparse = module.sparse + weight = module.weight - return x + new_module = cls(num_embeddings, embedding_dim, padding_idx, + max_norm, norm_type, scale_grad_by_freq, + sparse, weight, w_rot) + return new_module + + def __repr__(self): + return ( + f"RotateEmbedding({self.num_embeddings}, " + f"{self.embedding_dim}, " + f"w_rotate={self.w_rot is not None}, " + f"padding_idx={self.padding_idx})" + ) class RotateLinear(nn.Module): - def __init__( - self, - weight, - bias, - ori_module, - online_full_had, - online_partial_had, - fp32_had, - K, - had_K, - had_dim, - ): + def __init__(self, weight, bias, ori_module, w_rot, a_rot): super().__init__() - self.register_buffer('weight', weight) + self.register_buffer("weight", weight) if bias is not None: - self.register_buffer('bias', bias) + self.register_buffer("bias", bias) else: self.bias = None for name, buf in ori_module.named_buffers(): - if name.startswith('buf_'): + if name.startswith("buf_"): self.register_buffer(name, buf.data) - self.rotater = Rotater( - online_full_had, online_partial_had, fp32_had, K, had_K, had_dim - ) - self.register_buffer('buf_rotate', torch.tensor(True)) + self.w_rot = w_rot + self.a_rot = a_rot + + self.register_buffer("buf_w_rotate", torch.tensor(w_rot is not None)) + self.register_buffer("buf_a_rotate", torch.tensor(a_rot is not None)) def forward(self, x): - x = self.rotater.rotate(x) - x = torch.functional.F.linear(x, self.weight, self.bias) + if self.buf_a_rotate: + x = self.a_rot(x, self) + + if self.buf_w_rotate: + tmp_weight, tmp_bias = self._rotate_weight() + self.register_buffer("tmp_weight", tmp_weight, persistent=False) + self.register_buffer("tmp_bias", tmp_bias, persistent=False) + + weight = getattr(self, "tmp_weight", self.weight) + bias = getattr(self, "tmp_bias", self.bias) + x = torch.functional.F.linear(x, weight, bias) return x + + def _rotate_weight(self): + tmp_weight, tmp_bias = self.w_rot(self) + return tmp_weight, tmp_bias @classmethod @torch.no_grad() - def new( - cls, module, online_full_had, online_partial_had, fp32_had, K, had_K, had_dim - ): + def new(cls, module, w_rot, a_rot): weight = module.weight.data if module.bias is not None: bias = module.bias.data @@ -324,14 +358,9 @@ def new( weight, bias, ori_module=module, - online_full_had=online_full_had, - online_partial_had=online_partial_had, - fp32_had=fp32_had, - K=K, - had_K=had_K, - had_dim=had_dim, + w_rot=w_rot, + a_rot=a_rot ) - new_module.in_features = module.in_features new_module.out_features = module.out_features return new_module @@ -342,67 +371,72 @@ def get_func_name(cls, any_callable): return any_callable.func.__name__ return any_callable.__name__ - def register_activation_parameters(self, named_parameters): - pass - def 
__repr__(self): return ( - f'RotateLinear(in_features={self.in_features},' - f'out_features={self.out_features},' - f'bias={self.bias is not None},' - f'online_rotate={self.buf_rotate})' + f"RotateLinear(in_features={self.in_features}, " + f"out_features={self.out_features}, " + f"bias={self.bias is not None}, " + f"w_rotate={self.buf_w_rotate}, " + f"a_rotate={self.buf_a_rotate})" ) class FakeQuantLinear(nn.Module): - def __init__(self, weight, bias, ori_module, w_qdq, a_qdq): + def __init__(self, weight, bias, ori_module, w_qdq, a_qdq, w_rot, a_rot): super().__init__() - self.register_buffer('weight', weight) + self.register_buffer("weight", weight) if bias is not None: - self.register_buffer('bias', bias) + self.register_buffer("bias", bias) else: self.bias = None self.a_qdq = a_qdq self.w_qdq = w_qdq for name, buf in ori_module.named_buffers(): - if name.startswith('buf_'): - self.register_buffer(name, buf.data) - for name, buf in ori_module.named_parameters(): - if name.startswith('buf_'): + if name.startswith("buf_"): self.register_buffer(name, buf.data) - if hasattr(self, 'buf_rotate') and self.buf_rotate: - self.rotater = ori_module.rotater - else: - self.buf_rotate = False + if getattr(self, "buf_w_rotate", False): + self.w_rot = w_rot + if getattr(self, "buf_a_rotate", False): + self.a_rot = a_rot self.dynamic_quant_weight = False self.dynamic_quant_tmp_weight = False def forward(self, x): - if hasattr(self, 'buf_rotate') and self.buf_rotate: - x = self.rotater.rotate(x) + if hasattr(self, "a_rot"): + x = self.a_rot(x, self) if self.a_qdq is not None: x = self.a_qdq(x, self) - if not hasattr(self, 'tmp_weight'): - tmp_weight = self.w_qdq(self) - self.register_buffer('tmp_weight', tmp_weight, persistent=False) - self.tmp_bias = self.bias - - elif self.dynamic_quant_weight: + if hasattr(self, "w_rot") and self.w_rot is not None: + tmp_weight, tmp_bias = self._rotate_weight() + self.register_buffer("tmp_weight", tmp_weight, persistent=False) + self.register_buffer("tmp_bias", tmp_bias, persistent=False) self.tmp_weight = self.w_qdq(self) - self.tmp_bias = self.bias - elif self.dynamic_quant_tmp_weight: - self.tmp_weight = self.w_qdq(self) + else: + if not hasattr(self, "tmp_weight"): + tmp_weight = self.w_qdq(self) + self.register_buffer("tmp_weight", tmp_weight, persistent=False) + self.tmp_bias = self.bias - x = torch.functional.F.linear(x, self.tmp_weight, self.tmp_bias) + elif self.dynamic_quant_weight: + self.tmp_weight = self.w_qdq(self) + self.tmp_bias = self.bias + + elif self.dynamic_quant_tmp_weight: + self.tmp_weight = self.w_qdq(self) + x = torch.functional.F.linear(x, self.tmp_weight, self.tmp_bias) return x + def _rotate_weight(self): + tmp_weight, tmp_bias = self.w_rot(self) + return tmp_weight, tmp_bias + @classmethod @torch.no_grad() def new(cls, module, w_qdq, a_qdq): @@ -411,14 +445,15 @@ def new(cls, module, w_qdq, a_qdq): bias = module.bias.data else: bias = None + - new_module = cls(weight, bias, ori_module=module, w_qdq=w_qdq, a_qdq=a_qdq) + new_module = cls(weight, bias, ori_module=module, w_qdq=w_qdq, a_qdq=a_qdq, w_rot=module.w_rot, a_rot=module.a_rot) new_module.in_features = module.in_features new_module.out_features = module.out_features new_module.w_qdq_name = cls.get_func_name(w_qdq) new_module.a_qdq_name = ( - cls.get_func_name(a_qdq) if a_qdq is not None else 'None' + cls.get_func_name(a_qdq) if a_qdq is not None else "None" ) return new_module @@ -433,37 +468,36 @@ def register_activation_parameters(self, named_parameters): def __repr__(self): 
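        # Summarizes shape, bias, weight/act quantizers and rotation flags for logging.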
return ( - f'FakeQuantLinear(in_features={self.in_features},' - f'out_features={self.out_features}, bias={self.bias is not None},' - f'weight_quant={self.w_qdq_name},' - f'act_quant={self.a_qdq_name},' - f'online_rotate={self.buf_rotate})' + f"FakeQuantLinear(in_features={self.in_features}," + f"out_features={self.out_features}, bias={self.bias is not None}," + f"weight_quant={self.w_qdq_name}, " + f"act_quant={self.a_qdq_name}, " + f"w_rotate={self.buf_w_rotate}, " + f"a_rotate={self.buf_a_rotate}," ) class EffcientFakeQuantLinear(nn.Module): def __init__(self, weight, bias, ori_module, a_qdq): super().__init__() - self.register_buffer('weight', weight) + self.register_buffer("weight", weight) if bias is not None: - self.register_buffer('bias', bias) + self.register_buffer("bias", bias) else: self.bias = None self.a_qdq = a_qdq for name, buf in ori_module.named_buffers(): - if name.startswith('buf_'): + if name.startswith("buf_"): self.register_buffer(name, buf.data) - if hasattr(self, 'buf_rotate') and self.buf_rotate: - self.rotater = ori_module.rotater - else: - self.buf_rotate = False + if getattr(self, "buf_a_rotate", False): + self.a_rot = ori_module.a_rot @torch.no_grad() def forward(self, x): - if hasattr(self, 'buf_rotate') and self.buf_rotate: - x = self.rotater.rotate(x) + if hasattr(self, "a_rot"): + x = self.a_rot(x, self) if self.a_qdq is not None: x = self.a_qdq(x, self) @@ -473,6 +507,10 @@ def forward(self, x): @classmethod @torch.no_grad() def new(cls, module, w_qdq, a_qdq, debug_print={}): + + if hasattr(module, "w_rot") and module.w_rot is not None: + weight, bias = module.w_rot(module) + weight = w_qdq(module) if module.bias is not None: @@ -486,7 +524,7 @@ def new(cls, module, w_qdq, a_qdq, debug_print={}): new_module.out_features = module.out_features new_module.w_qdq_name = cls.get_func_name(w_qdq) new_module.a_qdq_name = ( - cls.get_func_name(a_qdq) if a_qdq is not None else 'None' + cls.get_func_name(a_qdq) if a_qdq is not None else "None" ) new_module.debug_print = debug_print return new_module @@ -499,146 +537,29 @@ def get_func_name(cls, any_callable): def __repr__(self): return ( - f'EffcientFakeQuantLinear(in_features={self.in_features},' - f'out_features={self.out_features},' - f'bias={self.bias is not None},' - f'weight_quant={self.w_qdq_name},' - f'act_quant={self.a_qdq_name},' - f'online_rotate={self.buf_rotate},' - f'debug_print={self.debug_print})' - ) - - -class VllmRealQuantLinear(nn.Module): - def __init__(self, weight, bias, scales, need_pack): - super().__init__() - weight_name = 'weight_packed' if need_pack else 'weight' - self.register_buffer(weight_name, weight) - - ( - self.register_buffer('bias', bias) - if bias is not None - else setattr(self, 'bias', None) + f"EffcientFakeQuantLinear(in_features={self.in_features}, " + f"out_features={self.out_features}, " + f"bias={self.bias is not None}, " + f"weight_quant={self.w_qdq_name}, " + f"act_quant={self.a_qdq_name}, " + f"debug_print={self.debug_print})" ) - self.register_buffer('weight_scale', scales) - - @torch.no_grad() - def forward(self, x): - raise NotImplementedError - - @classmethod - @torch.no_grad() - def new(cls, module, w_q, quant_config): - weight, scales = cls.quant_pack(module, w_q, quant_config) - if module.bias is not None: - bias = module.bias.data - else: - bias = None - need_pack = quant_config['weight'].get('need_pack', False) - new_module = cls(weight, bias, scales, need_pack) - new_module.in_features = module.in_features - new_module.out_features = module.out_features - 
new_module.weight_shape = weight.shape - new_module.weight_dtype = weight.dtype - new_module.scales_shape = scales.shape - new_module.scales_dtype = scales.dtype - - new_module.zeros_shape = None - new_module.zeros_dtype = None - - return new_module - - @classmethod - @torch.no_grad() - def quant_pack(cls, module, w_q, quant_config): - weight, scales, zeros = w_q(module) - need_pack = quant_config['weight'].get('need_pack', False) - if need_pack: - weight, scales = cls.pack(weight, scales, quant_config) - return weight, scales - - @classmethod - @torch.no_grad() - def pack(self, weight, scales, quant_config): - - # Packs a tensor of quantized weights stored in int8 into int32s with padding - scales = scales.to(torch.float16) - num_bits = quant_config['weight']['bit'] - - # convert to unsigned for packing - offset = pow(2, num_bits) // 2 - weight = (weight + offset).to(torch.uint8) - weight = weight.cpu().numpy().astype(np.uint32) - pack_factor = 32 // num_bits - - # pad input tensor and initialize packed output - packed_size = math.ceil(weight.shape[1] / pack_factor) - packed = np.zeros((weight.shape[0], packed_size), dtype=np.uint32) - padding = packed.shape[1] * pack_factor - weight.shape[1] - weight = np.pad(weight, pad_width=[(0, 0), (0, padding)], constant_values=0) - - # pack values - for i in range(pack_factor): - packed |= weight[:, i::pack_factor] << num_bits * i - - packed = np.ascontiguousarray(packed).view(np.int32) - int_weight = torch.from_numpy(packed) - return int_weight, scales - - def __repr__(self): - return ( - 'VllmRealQuantLinear(' - + f'in_features={self.in_features}, ' - + f'out_features={self.out_features}, ' - + f'bias={self.bias is not None}, ' - + f'weight_shape={self.weight_shape}, ' - + f'weight_dtype={self.weight_dtype}, ' - + f'scales_shape={self.scales_shape}, ' - + f'scales_dtype={self.scales_dtype}, ' - + f'zeros_shape={self.zeros_shape}, ' - + f'zeros_dtype={self.zeros_dtype})' - ) - - -class SglRealQuantLinear(VllmRealQuantLinear): - def __init__(self, weight, bias, scales, need_pack): - super().__init__(weight, bias, scales, need_pack) - - def __repr__(self): - return ( - 'SglRealQuantLinear(' - + f'in_features={self.in_features}, ' - + f'out_features={self.out_features}, ' - + f'bias={self.bias is not None}, ' - + f'weight_shape={self.weight_shape}, ' - + f'weight_dtype={self.weight_dtype}, ' - + f'scales_shape={self.scales_shape}, ' - + f'scales_dtype={self.scales_dtype}, ' - + f'zeros_shape={self.zeros_shape}, ' - + f'zeros_dtype={self.zeros_dtype})' - ) - - -class AutoawqRealQuantLinear(nn.Module): +class RealQuantLinear(nn.Module): def __init__(self, weight, bias, scales, zeros): super().__init__() - self.register_buffer('qweight', weight) - - ( - self.register_buffer('bias', bias) - if bias is not None - else setattr(self, 'bias', None) - ) - - self.register_buffer('scales', scales) + self.register_buffer("weight", weight) + if bias is not None: + self.register_buffer("bias", bias) + else: + self.bias = None + self.register_buffer("scales", scales) - ( - self.register_buffer('qzeros', zeros) - if zeros is not None - else setattr(self, 'qzeros', None) - ) + if zeros is not None: + self.register_buffer("zeros", zeros) + else: + self.zero = None @torch.no_grad() def forward(self, x): @@ -674,166 +595,98 @@ def new(cls, module, w_q, quant_config): @torch.no_grad() def quant_pack(cls, module, w_q, quant_config): weight, scales, zeros = w_q(module) - pack_version = quant_config['weight']['pack_version'] - if pack_version == 'gemm_pack': - int_weight, 
scales, int_zeros = \ - cls.gemm_pack(weight, scales, zeros, quant_config) - elif pack_version == 'gemv_pack': - int_weight, scales, int_zeros = \ - cls.gemv_pack(module, weight, scales, zeros, quant_config) - return int_weight, scales, int_zeros + weight, scales, zeros = cls.pack(weight, scales, zeros, quant_config) + return weight, scales, zeros @classmethod @torch.no_grad() - def gemm_pack(self, weight, scales, zeros, quant_config): - - if zeros is not None: - zeros = zeros.t().contiguous() - scales = scales.t().contiguous() - weight = weight.t().contiguous() - - bit = quant_config['weight']['bit'] - pack_num = 32 // bit - - int_weight = torch.zeros( - (weight.shape[0], weight.shape[1] // 32 * bit), - dtype=torch.int32, - device=weight.device, - ) - - for col in range(weight.shape[1] // pack_num): - if bit == 4: - order_map = [0, 2, 4, 6, 1, 3, 5, 7] - else: - raise NotImplementedError('Only 4-bit are supported for now.') - for i in range(pack_num): - int_weight_col = weight[:, col * pack_num + order_map[i]] - int_weight[:, col] |= int_weight_col << (i * bit) - - if zeros is not None: - int_zeros = torch.zeros( - (zeros.shape[0], zeros.shape[1] // 32 * bit), - dtype=torch.int32, - device=zeros.device, - ) - - for col in range(zeros.shape[1] // pack_num): - if bit == 4: - order_map = [0, 2, 4, 6, 1, 3, 5, 7] - else: - raise NotImplementedError('Only 4-bit are supported for now.') - for i in range(pack_num): - intzero_col = zeros[:, col * pack_num + order_map[i]] - int_zeros[:, col] |= intzero_col << (i * bit) + def pack(self, weight, scales, zeros, quant_config): + if quant_config["weight"]["bit"] == 8: + if zeros is not None: + zeros = zeros.view(weight.shape[0], -1) + scales = scales.view(weight.shape[0], -1) + return weight, scales, zeros + + h1, h2 = weight.shape + # pack 8 int4 in an int32 number, pack 16 int2 in an int32 number. + bit = quant_config["weight"]["bit"] + tmp = 32 // bit + + if ( + quant_config["weight"]["group_size"] != -1 + and quant_config["weight"]["granularity"] == "per_group" + ): + group_size = quant_config["weight"]["group_size"] else: - int_zeros = None + group_size = h2 - return int_weight, scales, int_zeros + assert h1 % tmp == 0 and h2 % tmp == 0, "H1 {} H2 {}".format(h1, h2) + assert h2 % group_size == 0, "H1 {} H2 {}".format(h1, h2) - @classmethod - @torch.no_grad() - def gemv_pack(self, module, weight, scales, zeros, quant_config): - - bit = quant_config['weight']['bit'] - group_size = quant_config['weight']['group_size'] - pack_num = 32 // bit - - q_scales = torch.zeros( - ( - scales.shape[0], - calculate_zeros_width(module.in_features, group_size) * pack_num, - ), - dtype=torch.float16, - device=scales.device, - ) - q_scales[:, : scales.shape[1]] = scales - - int_weight = torch.zeros( - (weight.shape[0], weight.shape[1] // 32 * bit), - dtype=torch.int32, - device=weight.device, - ) - - for col in range(weight.shape[1] // pack_num): - if bit == 4: - order_map = [0, 1, 2, 3, 4, 5, 6, 7] - else: - raise NotImplementedError('Only 4-bit are supported for now.') - for i in range(pack_num): - int_weight_col = weight[:, col * pack_num + order_map[i]] - int_weight[:, col] |= int_weight_col << (i * bit) + weight = weight.cuda() + int_weight = torch.empty(h1, h2 // tmp).to(torch.int32).cuda() + # Weight pack in row. 
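        # Each int32 of int_weight holds tmp = 32 // bit consecutive column values:
        # the quantized value from column pack + i occupies bits
        # [i * bit, (i + 1) * bit) of output column pack // tmp.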
+ for pack in range(0, h2, tmp): + for i in range(tmp): + int_weight[:, pack // tmp] += weight[:, pack + i] << (i * bit) + weight = weight.cpu() + int_weight = int_weight.cpu() + del weight if zeros is not None: - int_zeros = torch.zeros( - (zeros.shape[0], calculate_zeros_width(module.in_features, group_size)), - dtype=torch.int32, - device=zeros.device, - ) - - for col in range(zeros.shape[1] // pack_num): - if bit == 4: - order_map = [0, 1, 2, 3, 4, 5, 6, 7] - else: - raise NotImplementedError('Only 4-bit are supported for now.') - for i in range(pack_num): - if col * pack_num + order_map[i] >= zeros.shape[1]: - continue - int_zero_col = zeros[:, col * pack_num + order_map[i]] - int_zeros[:, col] |= int_zero_col << (i * bit) + zeros = zeros.cuda() + int_zeros = torch.zeros(h1 // tmp, h2 // group_size).to(torch.int32).cuda() + zeros = zeros.view(h1, -1) + # zero point pack in col. + for pack in range(0, h1, tmp): + for i in range(tmp): + int_zeros[pack // tmp, :] += zeros[pack + i, :] << (i * bit) + zeros = zeros.cpu() + int_zeros = int_zeros.cpu() + del zeros else: int_zeros = None - return int_weight, q_scales, int_zeros + gc.collect() + torch.cuda.empty_cache() - def __repr__(self): - return ( - 'AutoawqRealQuantLinear(' - + f'in_features={self.in_features}, ' - + f'out_features={self.out_features}, ' - + f'bias={self.bias is not None}, ' - + f'weight_shape={self.weight_shape}, ' - + f'weight_dtype={self.weight_dtype}, ' - + f'scales_shape={self.scales_shape}, ' - + f'scales_dtype={self.scales_dtype}, ' - + f'zeros_shape={self.zeros_shape}, ' - + f'zeros_dtype={self.zeros_dtype})' - ) - - -class MlcllmRealQuantLinear(AutoawqRealQuantLinear): - def __init__(self, weight, bias, scales, zeros): - super().__init__(weight, bias, scales, zeros) + scales = scales.view(h1, -1) + return int_weight, scales, int_zeros def __repr__(self): return ( - 'MlcllmRealQuantLinear(' - + f'in_features={self.in_features}, ' - + f'out_features={self.out_features}, ' - + f'bias={self.bias is not None}, ' - + f'weight_shape={self.weight_shape}, ' - + f'weight_dtype={self.weight_dtype}, ' - + f'scales_shape={self.scales_shape}, ' - + f'scales_dtype={self.scales_dtype}, ' - + f'zeros_shape={self.zeros_shape}, ' - + f'zeros_dtype={self.zeros_dtype})' + "RealQuantLinear(" + + f"in_features={self.in_features}, " + + f"out_features={self.out_features}, " + + f"bias={self.bias is not None}, " + + f"weight_shape={self.weight_shape}, " + + f"weight_dtype={self.weight_dtype}, " + + f"scales_shape={self.scales_shape}, " + + f"scales_dtype={self.scales_dtype}, " + + f"zeros_shape={self.zeros_shape}, " + + f"zeros_dtype={self.zeros_dtype})" ) -_TRANSFORMERS_LN_TYPES_ = ALL_LAYERNORM_LAYERS +_TRANSFORMERS_LN_TYPES_ = ALL_LAYERNORM_LAYERS + [ + MistralRMSNorm, + MixtralRMSNorm, + Qwen2RMSNorm, + LlamaRMSNorm, + nn.LayerNorm, +] _TRANSFORMERS_LINEAR_TYPES_ = [nn.Linear] _MODEL_LN_TYPES_PAIRS_ = { - 'Llama': LlmcLlamaRMSNorm, - 'Llava': LlmcLlamaRMSNorm, - 'Mistral': LlmcMistralRMSNorm, - 'Mixtral': LlmcMixtralRMSNorm, - 'Interlm2': LlmcInternLM2RMSNorm, - 'Qwen2': LlmcQwen2RMSNorm, - 'Gemma2': LlmcGemma2RMSNorm, - 'MiniCPM': LlmcMiniCPMRMSNorm, - 'Starcoder': LlmcLayerNorm, - 'Opt': LlmcLayerNorm, - 'Bloom': LlmcLayerNorm, + "Llama": LlmcLlamaRMSNorm, + "Llava": LlmcLlamaRMSNorm, + "Mistral": LlmcMistralRMSNorm, + "Mixtral": LlmcMixtralRMSNorm, + "Interlm2": LlmcInternLM2RMSNorm, + "Qwen2": LlmcQwen2RMSNorm, + "Starcoder": LlmcLayerNorm, + "Opt": LlmcLayerNorm, + "Bloom": LlmcLayerNorm, } @@ -845,8 +698,6 @@ def 
__repr__(self): LlmcMistralRMSNorm, LlmcMixtralRMSNorm, LlmcInternLM2RMSNorm, - LlmcGemma2RMSNorm, - LlmcMiniCPMRMSNorm, ] @@ -855,16 +706,5 @@ def __repr__(self): RotateLinear, FakeQuantLinear, EffcientFakeQuantLinear, - VllmRealQuantLinear, - SglRealQuantLinear, - AutoawqRealQuantLinear, - MlcllmRealQuantLinear + RealQuantLinear, ] - - -_REALQUANT_LINEAR_MAP_ = { - 'vllm_quant': VllmRealQuantLinear, - 'sgl_quant': SglRealQuantLinear, - 'autoawq_quant': AutoawqRealQuantLinear, - 'mlcllm_quant': MlcllmRealQuantLinear -} diff --git a/llmc/compression/quantization/ntweak.py b/llmc/compression/quantization/ntweak.py index b758bf2a8..022768a3d 100644 --- a/llmc/compression/quantization/ntweak.py +++ b/llmc/compression/quantization/ntweak.py @@ -1,6 +1,7 @@ import functools import gc import math +import pdb from contextlib import nullcontext from math import inf @@ -19,8 +20,8 @@ @ALGO_REGISTRY class NormTweaking(BaseBlockwiseQuantization): - def __init__(self, model, quant_config, input, padding_mask, config): - super().__init__(model, quant_config, input, padding_mask, config) + def __init__(self, model, quant_config, input, config): + super().__init__(model, quant_config, input, config) self.add_quant_config() model_type = self.config['model']['type'] @@ -139,6 +140,7 @@ def ntweak_train(self, block): if not math.isfinite(loss.item()): logger.info('Loss is NAN, stopping training') + pdb.set_trace() loss_list.append(loss.data) optimizer.zero_grad() diff --git a/llmc/compression/quantization/omniq.py b/llmc/compression/quantization/omniq.py index 8c5ff7c33..bddab35b3 100644 --- a/llmc/compression/quantization/omniq.py +++ b/llmc/compression/quantization/omniq.py @@ -2,6 +2,7 @@ import functools import gc import math +import pdb import random from contextlib import nullcontext from math import inf @@ -24,8 +25,8 @@ @ALGO_REGISTRY class OmniQuant(BaseBlockwiseQuantization): - def __init__(self, model, quant_config, input, padding_mask, config): - super().__init__(model, quant_config, input, padding_mask, config) + def __init__(self, model, quant_config, input, config): + super().__init__(model, quant_config, input, config) self.add_quant_config() model_type = self.config['model']['type'] @@ -213,6 +214,7 @@ def omni_train(self, block): if not math.isfinite(loss.item()): logger.info('Loss is NAN, stopping training') + pdb.set_trace() loss_list.append(loss.data) optimizer.zero_grad() @@ -305,7 +307,7 @@ def register_lwc_parameters(self, block, input_feat, init_value=4.0): torch.ones( (dim, 1), device=self.dev, - dtype=self.dtype, + # dtype=self.dtype, ) * init_value ) @@ -313,7 +315,7 @@ def register_lwc_parameters(self, block, input_feat, init_value=4.0): torch.ones( (dim, 1), device=self.dev, - dtype=self.dtype, + # dtype=self.dtype, ) * init_value ) @@ -383,13 +385,12 @@ def get_clip_parameters(self, input_feat, n, m): inputs = input_feat[n] max_val, min_val = self.auto_clip_layer( - n, m.weight.data, inputs, - n_sample_token=self.config.calib.get('seq_len', None), + n_sample_token=self.config.calib.seq_len, ) - up_factor, low_factor = self.get_clip_factor(m, min_val, max_val, n) + up_factor, low_factor = self.get_clip_factor(m, min_val, max_val) up_param = nn.Parameter(up_factor) low_param = nn.Parameter(low_factor) diff --git a/llmc/compression/quantization/osplus.py b/llmc/compression/quantization/osplus.py index a433c0c23..d95caf9d2 100644 --- a/llmc/compression/quantization/osplus.py +++ b/llmc/compression/quantization/osplus.py @@ -17,9 +17,9 @@ @ALGO_REGISTRY class 
OsPlus(BaseBlockwiseQuantization): - def __init__(self, model, quant_config, input, padding_mask, config): + def __init__(self, model, quant_config, input, config): torch.set_grad_enabled(False) - super().__init__(model, quant_config, input, padding_mask, config) + super().__init__(model, quant_config, input, config) special_config = self.quant_config.get('special', {}) self.weight_clip = special_config.get('weight_clip', False) @@ -106,7 +106,7 @@ def register_hooks(feat_dict): self.auto_clip( block, clip_input_feat, - n_sample_token=self.config.calib.get('seq_len', None), + n_sample_token=self.config.calib.seq_len, eps=3e-1, ) diff --git a/llmc/compression/quantization/quant.py b/llmc/compression/quantization/quant.py index f49f204f0..77c27ece1 100644 --- a/llmc/compression/quantization/quant.py +++ b/llmc/compression/quantization/quant.py @@ -3,30 +3,62 @@ from torch import nn -class BaseQuantizer(object): +class Quantizer: def __init__(self, bit, symmetric, granularity, **kwargs): - self.bit = bit + if isinstance(bit, str): + # for fp quantization, format: ExMy + self.use_fp = True + self.e_bits = int(bit[1]) + self.m_bits = int(bit[-1]) + self.sign_bits = 1 + self.bit = self.e_bits + self.m_bits + self.sign_bits + self.default_bias = 2 ** (self.e_bits - 1) + else: + self.use_fp = False + self.bit = bit self.sym = symmetric self.granularity = granularity self.kwargs = kwargs - self.calib_algo = self.kwargs.get('calib_algo', 'minmax') + if 'calib_algo' in self.kwargs: + self.calib_algo = self.kwargs['calib_algo'] + else: + self.calib_algo = 'minmax' + + if 'qmax_to_tensor' in self.kwargs and self.kwargs['qmax_to_tensor']: + if self.sym: + self.max_int = torch.tensor(2 ** (self.bit - 1) - 1).cuda() + self.min_int = torch.tensor(-(2 ** (self.bit - 1))).cuda() + else: + self.max_int = torch.tensor(2**self.bit - 1).cuda() + self.min_int = torch.tensor(0.0).cuda() + else: + if self.sym: + self.max_int = 2 ** (self.bit - 1) - 1 + self.min_int = -(2 ** (self.bit - 1)) + else: + self.max_int = 2**self.bit - 1 + self.min_int = 0.0 if self.granularity == 'per_group': self.group_size = self.kwargs['group_size'] elif self.granularity == 'per_head': self.head_num = self.kwargs['head_num'] - self.mse_b_num = self.kwargs.get('mse_b_num', 1) - - if self.kwargs.get('ste', False): + if 'ste' in self.kwargs and self.kwargs['ste']: self.round_func = lambda x: (x.round() - x).detach() + x else: self.round_func = torch.round - self.round_zp = self.kwargs.get('round_zp', True) + self.round_zp = 'round_zp' not in self.kwargs or self.kwargs['round_zp'] self.sigmoid = torch.nn.Sigmoid() + def __repr__(self): + return ( + f'Quantizer(bit={self.bit}, sym={self.sym}, granularity={self.granularity},' + f'kwargs={self.kwargs}, max_int={self.max_int}, min_int={self.min_int})' + ) + def get_tensor_range(self, tensor, args={}): if self.calib_algo == 'minmax': return self.get_minmax_range(tensor) @@ -35,7 +67,7 @@ def get_tensor_range(self, tensor, args={}): elif self.calib_algo == 'learnable': return self.get_learnable_range(tensor, **args) else: - raise ValueError(f'Unsupported calibration algorithm: {self.calib_algo}') + logger.info('Calibration Algorithm Not Found!') def get_minmax_range(self, tensor): if self.granularity == 'per_tensor': @@ -47,21 +79,20 @@ def get_minmax_range(self, tensor): return (min_val, max_val) - def get_mse_range(self, tensor, grid=100, norm=2.4, maxshrink=0.8, bs=256): - - assert self.mse_b_num >= 1 and tensor.shape[0] % self.mse_b_num == 0, \ - 'Batch number must be divisible by 
tensor.shape[0],' - bs = tensor.shape[0] // self.mse_b_num + @torch.no_grad() + def get_mse_range(self, tensor, grid=100, norm=2.4, maxshrink=0.8, bs=1024): + if tensor.shape[0] % bs != 0: + bs = tensor.shape[0] tensor = tensor.float() min_val, max_val = self.get_minmax_range(tensor) dev = tensor.device - for b_num in range(self.mse_b_num): - _tensor = tensor[b_num * bs: (b_num + 1) * bs, :] + for b_num in range(tensor.shape[0] // bs): + _tensor = tensor[b_num * bs : (b_num + 1) * bs, :] _min_val, _max_val = ( - min_val[b_num * bs: (b_num + 1) * bs, :], - max_val[b_num * bs: (b_num + 1) * bs, :], + min_val[b_num * bs : (b_num + 1) * bs, :], + max_val[b_num * bs : (b_num + 1) * bs, :], ) best = torch.full([_tensor.shape[0]], float('inf'), device=dev) @@ -74,22 +105,18 @@ def get_mse_range(self, tensor, grid=100, norm=2.4, maxshrink=0.8, bs=256): xmin = p * _min_val xmax = p * _max_val - if self.quant_type == 'float-quant' and not self.use_qtorch: - clip_tensor, scales = self.get_float_qparams( - _tensor, (xmin, xmax), dev + if not self.use_fp: + scales, zeros, max_int, min_int = self.get_qparams( + (xmin, xmax), dev ) - zeros, qmin, qmax = 0, None, None q_tensor = self.quant_dequant( - clip_tensor, scales, zeros, qmax, qmin + _tensor, scales, zeros, max_int, min_int ) - else: - scales, zeros, qmax, qmin = self.get_qparams( - (xmin, xmax), dev - ) - q_tensor = self.quant_dequant( - _tensor, scales, zeros, qmax, qmin + clip_tensor, scales = self.get_fp_qparams( + _tensor, (xmin, xmax), dev ) + q_tensor = self.fp_quant_dequant(clip_tensor, scales) q_tensor -= _tensor q_tensor.abs_() @@ -104,8 +131,8 @@ def get_mse_range(self, tensor, grid=100, norm=2.4, maxshrink=0.8, bs=256): best_max_val[tmp] = xmax[tmp] ( - min_val[b_num * bs: (b_num + 1) * bs, :], - max_val[b_num * bs: (b_num + 1) * bs, :], + min_val[b_num * bs : (b_num + 1) * bs, :], + max_val[b_num * bs : (b_num + 1) * bs, :], ) = (best_min_val, best_max_val) return (min_val, max_val) @@ -128,94 +155,68 @@ def get_learnable_range(self, tensor, lowbound_factor=None, upbound_factor=None) def get_qparams(self, tensor_range, device): min_val, max_val = tensor_range[0], tensor_range[1] - qmin = self.qmin - qmax = self.qmax + max_int = self.max_int + min_int = self.min_int if self.sym: abs_max = torch.max(max_val.abs(), min_val.abs()) abs_max = abs_max.clamp(min=1e-5) - scales = abs_max / qmax + scales = abs_max / max_int zeros = torch.tensor(0.0) else: - scales = (max_val - min_val).clamp(min=1e-5) / (qmax - qmin) - zeros = (qmin - torch.round(min_val / scales)).clamp(qmin, qmax) + scales = (max_val - min_val).clamp(min=1e-5) / max_int + zeros = (-torch.round(min_val / scales)).clamp(min_int, max_int) if not self.round_zp: - zeros = qmin - (min_val / scales) - return scales, zeros, qmax, qmin + zeros = -min_val / scales + return scales, zeros, max_int, min_int def get_tensor_qparams(self, tensor, args={}): tensor = self.reshape_tensor(tensor) tensor_range = self.get_tensor_range(tensor, args) - scales, zeros, qmax, qmin = self.get_qparams(tensor_range, tensor.device) - return tensor, scales, zeros, qmax, qmin + scales, zeros, max_int, min_int = self.get_qparams(tensor_range, tensor.device) + return tensor, scales, zeros, max_int, min_int - def reshape_tensor(self, tensor, allow_padding=False): - if self.granularity == 'per_group': - if tensor.shape[1] >= self.group_size: - if tensor.shape[1] % self.group_size == 0: - t = tensor.reshape(-1, self.group_size) - elif allow_padding: - deficiency = self.group_size - tensor.shape[1] % 
self.group_size - prefix = tensor.shape[:-1] - pad_zeros = torch.zeros( - (*prefix, deficiency), - device=tensor.device, dtype=tensor.dtype) - t = torch.cat( - (tensor, pad_zeros), - dim=-1).reshape(-1, self.group_size) - else: - raise ValueError( - f'Dimension {tensor.shape[-1]} ' - f'not divisible by group size {self.group_size}' - ) - else: - t = tensor - elif self.granularity == 'per_head': - t = tensor.reshape(self.head_num, -1) - else: - t = tensor - return t + def get_fp_tensor_qparams(self, tensor, args={}): + tensor = self.reshape_tensor(tensor) + tensor_range = self.get_tensor_range(tensor, args) + clip_tensor, scales = self.get_fp_qparams(tensor, tensor_range, tensor.device) + return clip_tensor, scales - def restore_tensor(self, tensor, shape): - if tensor.shape == shape: - t = tensor - else: - try: - t = tensor.reshape(shape) - except RuntimeError: - deficiency = self.group_size - shape[1] % self.group_size - t = tensor.reshape(*shape[:-1], -1)[..., :-deficiency] - return t + def get_fp_qparams(self, tensor, tensor_range, device): + min_val, max_val = tensor_range[0], tensor_range[1] + maxval = torch.max(max_val, -min_val) + e_bits = torch.tensor(self.e_bits, dtype=torch.float32).cuda() + m_bits = torch.tensor(self.m_bits, dtype=torch.float32).cuda() -class IntegerQuantizer(BaseQuantizer): - def __init__(self, bit, symmetric, granularity, **kwargs): - super().__init__(bit, symmetric, granularity, **kwargs) - self.quant_type = 'int-quant' - if 'int_range' in self.kwargs: - self.qmin = self.kwargs['int_range'][0] - self.qmax = self.kwargs['int_range'][1] - else: - if self.sym: - self.qmin = -(2 ** (self.bit - 1)) - self.qmax = 2 ** (self.bit - 1) - 1 - else: - self.qmin = 0.0 - self.qmax = 2**self.bit - 1 + if maxval.shape[0] != 1 and len(maxval.shape) != len(tensor.shape): + maxval = maxval.view([-1] + [1] * (len(tensor.shape) - 1)) - if self.kwargs.get('qmax_to_tensor'): - self.qmin = torch.tensor(self.qmin).cuda() - self.qmax = torch.tensor(self.qmax).cuda() + if e_bits >= 5: + maxval = maxval.to(dtype=torch.float32) + + bias = 2**e_bits - torch.log2(maxval) + torch.log2(2 - 2 ** (-m_bits)) - 1 + + xc = torch.min(torch.max(tensor, -maxval), maxval) - def quant(self, tensor, scales, zeros, qmax, qmin): + log_scales = torch.clamp( + (torch.floor(torch.log2(torch.abs(xc)) + bias)).detach(), 1.0 + ) + + scales = 2.0 ** (log_scales - m_bits - bias) + + return xc, scales + + def quant(self, tensor, scales, zeros, max_int, min_int): if self.round_zp: tensor = torch.clamp( - self.round_func(tensor / scales) + zeros, qmin, qmax + self.round_func(tensor / scales) + zeros, min_int, max_int ) else: + tensor = torch.clamp( self.round_func(tensor / scales.clamp_min(1e-9) + zeros), - qmin, - qmax, + min_int, + max_int, ) return tensor @@ -223,11 +224,34 @@ def dequant(self, tensor, scales, zeros): tensor = (tensor - zeros) * scales return tensor - def quant_dequant(self, tensor, scales, zeros, qmax, qmin): - tensor = self.quant(tensor, scales, zeros, qmax, qmin) + def fp_quant_dequant(self, tensor, scales): + tensor = self.round_func(tensor / scales) * scales + return tensor + + def quant_dequant(self, tensor, scales, zeros, max_int, min_int): + tensor = self.quant(tensor, scales, zeros, max_int, min_int) tensor = self.dequant(tensor, scales, zeros) return tensor + def reshape_tensor(self, tensor): + if self.granularity == 'per_group': + if tensor.shape[1] >= self.group_size: + t = tensor.reshape(-1, self.group_size) + else: + t = tensor + elif self.granularity == 'per_head': + t = 
tensor.reshape(self.head_num, -1) + else: + t = tensor + return t + + def restore_tensor(self, tensor, shape): + if tensor.shape == shape: + t = tensor + else: + t = tensor.reshape(shape) + return t + def fake_quant_act_static(self, act, args={}): if 'int_indices' in args: q_act = act[:, :, args['int_indices']] @@ -242,14 +266,14 @@ def fake_quant_act_static(self, act, args={}): org_act_shape = q_act.shape org_act_dtype = q_act.dtype - scales, zeros, qmax, qmin = ( + scales, zeros, max_int, min_int = ( args['scales'], args['zeros'], - args['qmax'], - args['qmin'], + args['max_int'], + args['min_int'], ) q_act = self.reshape_tensor(q_act) - q_act = self.quant_dequant(q_act, scales, zeros, qmax, qmin) + q_act = self.quant_dequant(q_act, scales, zeros, max_int, min_int) q_act = self.restore_tensor(q_act, org_act_shape).to(org_act_dtype) if 'current_bit' in args: @@ -263,6 +287,7 @@ def fake_quant_act_static(self, act, args={}): return q_act + # support mix precision quant act def fake_quant_act_dynamic(self, act, args={}): if 'int_indices' in args: q_act = act[:, :, args['int_indices']] @@ -277,10 +302,14 @@ def fake_quant_act_dynamic(self, act, args={}): org_act_shape = q_act.shape org_act_dtype = q_act.dtype - q_act, scales, zeros, qmax, qmin = self.get_tensor_qparams( - q_act, args - ) - q_act = self.quant_dequant(q_act, scales, zeros, qmax, qmin) + if not self.use_fp: + q_act, scales, zeros, max_int, min_int = self.get_tensor_qparams( + q_act, args + ) + q_act = self.quant_dequant(q_act, scales, zeros, max_int, min_int) + else: + q_act, scales = self.get_fp_tensor_qparams(q_act, args) + q_act = self.fp_quant_dequant(q_act, scales) q_act = self.restore_tensor(q_act, org_act_shape).to(org_act_dtype) @@ -309,14 +338,14 @@ def fake_quant_weight_static(self, weight, args): org_w_shape = q_weight.shape org_w_dtype = q_weight.dtype - scales, zeros, qmax, qmin = ( + scales, zeros, max_int, min_int = ( args['scales'], args['zeros'], - args['qmax'], - args['qmin'], + args['max_int'], + args['min_int'], ) q_weight = self.reshape_tensor(q_weight) - q_weight = self.quant_dequant(q_weight, scales, zeros, qmax, qmin) + q_weight = self.quant_dequant(q_weight, scales, zeros, max_int, min_int) q_weight = self.restore_tensor(q_weight, org_w_shape).to(org_w_dtype) if 'int_indices' in args: @@ -330,6 +359,7 @@ def fake_quant_weight_static(self, weight, args): return q_weight + # support mix precision quant weight def fake_quant_weight_dynamic(self, weight, args={}): if 'int_indices' in args: if self.granularity == 'per_group': @@ -348,11 +378,15 @@ def fake_quant_weight_dynamic(self, weight, args={}): org_w_shape = q_weight.shape org_w_dtype = q_weight.dtype + if not self.use_fp: + q_weight, scales, zeros, max_int, min_int = self.get_tensor_qparams( + q_weight, args + ) - q_weight, scales, zeros, qmax, qmin = self.get_tensor_qparams( - q_weight, args - ) - q_weight = self.quant_dequant(q_weight, scales, zeros, qmax, qmin) + q_weight = self.quant_dequant(q_weight, scales, zeros, max_int, min_int) + else: + q_weight, scales = self.get_fp_tensor_qparams(q_weight, args) + q_weight = self.fp_quant_dequant(q_weight, scales) q_weight = self.restore_tensor(q_weight, org_w_shape).to(org_w_dtype) @@ -372,282 +406,48 @@ def fake_quant_weight_dynamic(self, weight, args={}): def real_quant_weight_static(self, weight, args): org_w_shape = weight.shape - scales, zeros, qmax, qmin = ( + scales, zeros, max_int, min_int = ( args['scales'], args['zeros'], - args['qmax'], - args['qmin'], + args['max_int'], + args['min_int'], ) 
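+        # Produce raw integer codes here (quant without dequant); a downstream
+        # packing step (e.g. RealQuantLinear.pack in this patch) is assumed to
+        # store these codes together with the per-group scales/zeros.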
weight = self.reshape_tensor(weight) - weight = self.quant(weight, scales, zeros, qmax, qmin) + weight = self.quant(weight, scales, zeros, max_int, min_int) weight = self.restore_tensor(weight, org_w_shape) if self.bit == 8: - if self.qmin != 0: + if self.sym: dtype = torch.int8 else: dtype = torch.uint8 else: dtype = torch.int32 weight = weight.to(dtype) - if not self.sym and self.round_zp: + if zeros != torch.tensor(0.0) and self.round_zp: zeros = zeros.to(dtype) - elif self.sym: + else: zeros = None - if zeros is not None: - zeros = zeros.view(weight.shape[0], -1) - scales = scales.view(weight.shape[0], -1) - return weight, scales, zeros def real_quant_weight_dynamic(self, weight, args={}): org_w_shape = weight.shape - weight, scales, zeros, qmax, qmin = self.get_tensor_qparams(weight, args) - weight = self.quant(weight, scales, zeros, qmax, qmin) + weight, scales, zeros, max_int, min_int = self.get_tensor_qparams(weight, args) + weight = self.quant(weight, scales, zeros, max_int, min_int) weight = self.restore_tensor(weight, org_w_shape) if self.bit == 8: - if self.qmin != 0: + if self.sym: dtype = torch.int8 else: dtype = torch.uint8 else: dtype = torch.int32 weight = weight.to(dtype) - if not self.sym and self.round_zp: + if zeros != torch.tensor(0.0) and self.round_zp: zeros = zeros.to(dtype) - elif self.sym: - zeros = None - - if zeros is not None: - zeros = zeros.view(weight.shape[0], -1) - scales = scales.view(weight.shape[0], -1) - - return weight, scales, zeros - - def __repr__(self): - return ( - f'IntegerQuantizer(bit={self.bit}, sym={self.sym},' - f'granularity={self.granularity},' - f'kwargs={self.kwargs}, qmin={self.qmin}, qmax={self.qmax})' - ) - - -class FloatQuantizer(BaseQuantizer): - def __init__(self, bit, symmetric, granularity, **kwargs): - super().__init__(bit, symmetric, granularity, **kwargs) - self.sym = True - self.quant_type = 'float-quant' - self.e_bits = int(self.bit[1]) - self.m_bits = int(self.bit[-1]) - self.sign_bits = 1 - self.num_bits = self.e_bits + self.m_bits + self.sign_bits - self.default_bias = 2 ** (self.e_bits - 1) - - self.use_qtorch = self.kwargs.get('use_qtorch') - if self.use_qtorch: - try: - from qtorch.quant import float_quantize - except ImportError: - logger.error('qtorch not found, please install qtorch.') - raise ImportError('Please install qtorch (pip install qtorch).') - - self.float_quantize = float_quantize - - if 'float_range' in self.kwargs: - self.qmin, self.qmax = self.kwargs['float_range'] - else: - bit_ranges = { - ('e4m3', 8): torch.float8_e4m3fn, - ('e5m2', 8): torch.float8_e5m2, - ('e3m2', 6): (-28, 28), - ('e4m7', 12): (-510, 510), - ('e2m1', 4): (-6, 6), - } - - key = (self.bit, self.num_bits) - if key in bit_ranges: - if isinstance(bit_ranges[key], tuple): - self.qmin, self.qmax = bit_ranges[key] - else: - finfo = torch.finfo(bit_ranges[key]) - self.qmin, self.qmax = finfo.min, finfo.max - else: - raise NotImplementedError('Only 4, 6, 8, and \ - 12-bit quantization is supported.') - - def get_float_qparams(self, tensor, tensor_range, device): - min_val, max_val = tensor_range[0], tensor_range[1] - maxval = torch.max(max_val, -min_val) - - e_bits = torch.tensor(self.e_bits, dtype=torch.float32).cuda() - m_bits = torch.tensor(self.m_bits, dtype=torch.float32).cuda() - - if maxval.shape[0] != 1 and len(maxval.shape) != len(tensor.shape): - maxval = maxval.view([-1] + [1] * (len(tensor.shape) - 1)) - - if e_bits >= 5: - maxval = maxval.to(dtype=torch.float32) - - bias = 2**e_bits - torch.log2(maxval) + torch.log2(2 - 2 ** 
(-m_bits)) - 1 - - xc = torch.min(torch.max(tensor, -maxval), maxval) - - log_scales = torch.clamp( - (torch.floor(torch.log2(torch.abs(xc)) + bias)).detach(), 1.0 - ) - scales = 2.0 ** (log_scales - m_bits - bias) - - return xc, scales - - def get_tensor_qparams(self, tensor, args={}): - tensor = self.reshape_tensor(tensor) - tensor_range = self.get_tensor_range(tensor, args) - if self.use_qtorch: - scales, zeros, qmax, qmin = self.get_qparams(tensor_range, tensor.device) else: - tensor, scales = self.get_float_qparams(tensor, tensor_range, tensor.device) - zeros, qmin, qmax = torch.tensor(0), None, None - - return tensor, scales, zeros, qmax, qmin - - def quant(self, tensor, scales, zeros, qmax, qmin): - scales[scales == 0] = 1 - scaled_tensor = tensor / scales + zeros - if self.use_qtorch: - org_dtype = scaled_tensor.dtype - q_tensor = self.float_quantize(scaled_tensor.float(), - self.e_bits, - self.m_bits, - rounding='nearest') - q_tensor.to(org_dtype) - else: - q_tensor = self.round_func(scaled_tensor) - return q_tensor - - def dequant(self, tensor, scales, zeros): - tensor = (tensor - zeros) * scales - return tensor - - def quant_dequant(self, tensor, scales, zeros, qmax, qmin): - tensor = self.quant(tensor, scales, zeros, qmax, qmin) - tensor = self.dequant(tensor, scales, zeros) - return tensor - - def fake_quant_act_static(self, act, args={}): - q_act = act - org_act_shape = q_act.shape - org_act_dtype = q_act.dtype - - scales, zeros, qmax, qmin = ( - args['scales'], - args['zeros'], - args['qmax'], - args['qmin'], - ) - q_act = self.reshape_tensor(q_act) - q_act = self.quant_dequant(q_act, scales, zeros, qmax, qmin) - q_act = self.restore_tensor(q_act, org_act_shape).to(org_act_dtype) - - return q_act - - def fake_quant_act_dynamic(self, act, args={}): - q_act = act - org_act_shape = q_act.shape - org_act_dtype = q_act.dtype - - q_act, scales, zeros, qmax, qmin = self.get_tensor_qparams( - q_act, args - ) - q_act = self.quant_dequant(q_act, scales, zeros, qmax, qmin) - - q_act = self.restore_tensor(q_act, org_act_shape).to(org_act_dtype) - return q_act - - def fake_quant_weight_static(self, weight, args): - - if 'dim' in args and 'ic' in args['dim']: - q_weight = weight.T - else: - q_weight = weight - - org_w_shape = q_weight.shape - org_w_dtype = q_weight.dtype - scales, zeros, qmax, qmin = ( - args['scales'], - args['zeros'], - args['qmax'], - args['qmin'], - ) - q_weight = self.reshape_tensor(q_weight) - q_weight = self.quant_dequant(q_weight, scales, zeros, qmax, qmin) - q_weight = self.restore_tensor(q_weight, org_w_shape).to(org_w_dtype) - - if 'dim' in args and 'ic' in args['dim']: - q_weight = q_weight.T - - return q_weight - - def fake_quant_weight_dynamic(self, weight, args={}): - - if 'dim' in args and 'ic' in args['dim']: - q_weight = weight.T - else: - q_weight = weight - - org_w_shape = q_weight.shape - org_w_dtype = q_weight.dtype - - q_weight, scales, zeros, qmax, qmin = self.get_tensor_qparams( - q_weight, args - ) - q_weight = self.quant_dequant(q_weight, scales, zeros, qmax, qmin) - q_weight = self.restore_tensor(q_weight, org_w_shape).to(org_w_dtype) - - if 'dim' in args and 'ic' in args['dim']: - q_weight = q_weight.T - - return q_weight - - def real_quant_weight_static(self, weight, args): - assert self.bit in ['e4m3', 'e5m2'], 'Only FP8 E4M3 and E5M2 support real quant' - dtype = torch.float8_e4m3fn if self.e_bits == 4 else torch.float8_e5m2 - - org_w_shape = weight.shape - scales, zeros, qmax, qmin = ( - args['scales'], - args['zeros'], - args['qmax'], - 
args['qmin'], - ) - weight = self.reshape_tensor(weight) - weight = self.quant(weight, scales, zeros, qmax, qmin) - weight = self.restore_tensor(weight, org_w_shape) - - weight = weight.to(dtype) - zeros = None - scales = scales.view(weight.shape[0], -1) - return weight, scales, zeros - - def real_quant_weight_dynamic(self, weight, args={}): - assert self.bit in ['e4m3', 'e5m2'], 'Only FP8 E4M3 and E5M2 support real quant' - dtype = torch.float8_e4m3fn if self.e_bits == 4 else torch.float8_e5m2 - - org_w_shape = weight.shape - weight, scales, zeros, qmax, qmin = self.get_tensor_qparams(weight, args) - weight = self.quant(weight, scales, zeros, qmax, qmin) - weight = self.restore_tensor(weight, org_w_shape) + zeros = None - weight = weight.to(dtype) - zeros = None - scales = scales.view(weight.shape[0], -1) return weight, scales, zeros - - def __repr__(self): - return ( - f'FloatQuantizer(bit={self.bit},' - f'e_bits={self.e_bits}, m_bits={self.m_bits},' - f'granularity={self.granularity},' - f'kwargs={self.kwargs}, qmin={self.qmin}, qmax={self.qmax})' - ) diff --git a/llmc/compression/quantization/quarot.py b/llmc/compression/quantization/quarot.py index 8e6c1d2df..fb4baaafd 100644 --- a/llmc/compression/quantization/quarot.py +++ b/llmc/compression/quantization/quarot.py @@ -1,6 +1,4 @@ import gc -import json -import os import torch import torch.nn as nn @@ -16,22 +14,15 @@ @ALGO_REGISTRY class Quarot(BaseBlockwiseQuantization): - def __init__(self, model, quant_config, input, padding_mask, config): - super().__init__(model, quant_config, input, padding_mask, config) + def __init__(self, model, quant_config, input, config): + super().__init__(model, quant_config, input, config) self.dev = torch.device('cuda') self.add_quant_config() self.preprocess() def preprocess(self): - if torch.equal( - self.model.get_head_layers()[0].weight, - self.model.get_embed_layers()[0].weight, - ): - logger.info('Tie weight! Copy embed_layer for head_layer!') - del self.model.get_head_layers()[0].weight - w = self.model.get_embed_layers()[0].weight.clone() - self.model.get_head_layers()[0].weight = nn.Parameter(w) - + assert self.config['model']['type'] in ['Opt', 'Llama'] + # if self.config["model"]["type"] in ["Opt"]: self.remove_mean_from_embed() self.Q = self.get_orthogonal_matrix() @@ -49,31 +40,30 @@ def preprocess(self): ) self.rotate_head(self.Q) + gc.collect() torch.cuda.empty_cache() + def a_rot(self, act, module, a_rotater): + return a_rotater.rotate(act) + @torch.no_grad() def add_quant_config(self): self.rotate_mode = self.quant_config['special']['rotate_mode'] def get_orthogonal_matrix(self): if self.rotate_mode == 'random': - try: - return random_orthogonal_matrix(self.hidden_size, self.dev) - except NameError: - raise RuntimeError( - 'Function random_orthogonal_matrix is not defined.' 
- ) + return random_orthogonal_matrix(self.hidden_size, self.dev) elif self.rotate_mode == 'hadamard': return random_hadamard_matrix(self.hidden_size, self.dev) else: raise ValueError(f'Unsupported mode {self.mode}') - def block_transform(self, block): + def block_transform(self, block, ): logger.info(f'Start transform the {self.block_idx+1}-th block') if self.online_rotate: - self.replace_rotate_linears(block) + self.replace_rotate_fcs(block) subsets = self.model.get_subsets_in_block(block) for index, subset in enumerate(subsets): self.subset_transform(block, subset) @@ -97,31 +87,18 @@ def subset_transform(self, block, subset): self.fuse_ln_fcs(prev_op[0], layers) self.rotate_pre_layers(layers, self.Q) else: - if self.config['model']['type'] in ['Opt', 'StableLm']: - self.bake_mean_into_fc(layers[0]) + if self.config['model']['type'] in ['Opt']: + self.bake_mean_into_linear(layers[0]) if 'is_mlp' in subset and subset['is_mlp']: self.rotate_post_layers( layers, self.Q, exact_had=True if self.online_rotate else False ) else: - for n, m in layers_dict.items(): - logger.info(f'layer: {n} {m.weight.shape}') - logger.info(f'{self.Q.shape}') self.rotate_post_layers(layers, self.Q, exact_had=False) if self.online_rotate: + R2 = None apply_exact_had_to_linear( - prev_op[0], had_dim=self.head_dim, output=True + prev_op[0], had_dim=self.head_dim, output=True, R2=R2 ) - apply_exact_had_to_linear(layers[0], had_dim=-1, output=False) - - @torch.no_grad() - def save_model(self, path): - super().save_model(path) - path = os.path.join(path, 'config.json') - with open(path, 'r') as f: - config = json.load(f) - if 'tie_word_embeddings' in config: - config['tie_word_embeddings'] = False - with open(path, 'w') as f: - json.dump(config, f, indent=4) + apply_exact_had_to_linear(layers[0], had_dim=-1, output=False, R2=R2) diff --git a/llmc/compression/quantization/quik.py b/llmc/compression/quantization/quik.py index 3a1e0441b..f57bf9576 100644 --- a/llmc/compression/quantization/quik.py +++ b/llmc/compression/quantization/quik.py @@ -12,8 +12,8 @@ @ALGO_REGISTRY class QUIK(BaseBlockwiseQuantization): - def __init__(self, model, quant_config, input, padding_mask, config): - super().__init__(model, quant_config, input, padding_mask, config) + def __init__(self, model, quant_config, input, config): + super().__init__(model, quant_config, input, config) self.add_quant_config() def add_quant_config(self): diff --git a/llmc/compression/quantization/rotate_utils.py b/llmc/compression/quantization/rotate_utils.py new file mode 100644 index 000000000..1ed935701 --- /dev/null +++ b/llmc/compression/quantization/rotate_utils.py @@ -0,0 +1,102 @@ +import math + +import torch +import torch.nn as nn +from loguru import logger + +from .hadamard_utils import HadamardTransform, matmul_hadU_cuda + + +class RotateModule(nn.Module): + def __init__(self, Q_init): + super(RotateModule, self).__init__() + self.weight = nn.Parameter(Q_init.to(torch.float32).to(torch.device('cuda'))) + + def forward(self, x, transpose=False): + if transpose: + return x @ self.weight + else: + return self.weight @ x + + +class WeightRotater: + def __init__(self, weight_rotate_func, dev): + self.rotate_func = weight_rotate_func + self.dev = dev + + def rotate(self, weight, bias, Q1, Q2, transpose): + + if Q1 is not None: + tmp_weight, tmp_bias = self.rotate_func(weight, bias, Q1.weight, transpose) + + if Q2 is not None: + had_dim = Q2.weight.shape[0] + dtype = tmp_weight.dtype + if transpose: + init_shape = tmp_weight.shape + tmp_weight = 
tmp_weight.reshape(-1, init_shape[-1] // had_dim, had_dim) + tmp_weight, _ = self.rotate_func(tmp_weight, bias, Q2.weight, False) + tmp_weight = tmp_weight.reshape(init_shape) + else: + tmp_weight = tmp_weight.t() + transposed_shape = tmp_weight.shape + tmp_weight = tmp_weight.reshape(-1, transposed_shape[-1] // had_dim, had_dim) + tmp_weight, _ = self.rotate_func(tmp_weight, bias, Q2.weight, False) + tmp_weight = tmp_weight.reshape(transposed_shape).t() + + if Q1 is None and Q2 is None: + tmp_weight = weight + tmp_bias = bias + + tmp_weight = tmp_weight.to(self.dev) + tmp_bias = tmp_bias.to(self.dev) if tmp_bias is not None else None + + return tmp_weight, tmp_bias + + +class ActRotater: + def __init__( + self, online_full_had, online_partial_had, fp32_had, K, had_K=None, had_dim=None + ): + self.online_full_had = online_full_had + self.online_partial_had = online_partial_had + self.fp32_had = fp32_had + self.K = K + self.had_K = had_K + self.had_dim = had_dim + + def rotate(self, x): + x_dtype = x.dtype + + if self.online_full_had: + if self.fp32_had: + x = matmul_hadU_cuda(x.float(), self.had_K, self.K).to(x_dtype) + else: + x = matmul_hadU_cuda(x, self.had_K, self.K) + + elif self.online_partial_had: + if self.fp32_had: + x = x.float() + init_shape = x.shape + if self.K == 1: + x = ( + HadamardTransform.apply( + x.reshape( + -1, init_shape[-1] // self.had_dim, self.had_dim + ).transpose(1, 2) + ) + / math.sqrt(init_shape[-1] // self.had_dim) + ).transpose(1, 2) + else: + self.had_K = self.had_K.to(x.device) + + x = ( + self.had_K.to(x.dtype) + @ x.reshape(-1, init_shape[-1] // self.had_dim, self.had_dim) + ) / math.sqrt(init_shape[-1] // self.had_dim) + + if self.fp32_had: + x = x.to(x_dtype) + x = x.reshape(init_shape) + + return x diff --git a/llmc/compression/quantization/rtn.py b/llmc/compression/quantization/rtn.py index aba208510..609b1c51d 100644 --- a/llmc/compression/quantization/rtn.py +++ b/llmc/compression/quantization/rtn.py @@ -8,8 +8,8 @@ @ALGO_REGISTRY class RTN(BaseBlockwiseQuantization): - def __init__(self, model, quant_config, input, padding_mask, config): - super().__init__(model, quant_config, input, padding_mask, config) + def __init__(self, model, quant_config, input, config): + super().__init__(model, quant_config, input, config) if quant_config.get('act', False) and quant_config['act'].get('static', False): logger.info('Activation quant is static. 
Calibration is required.') self.act_static = True @@ -18,8 +18,7 @@ def __init__(self, model, quant_config, input, padding_mask, config): @torch.no_grad() def block_opt(self, *opt_kwargs): - if self.act_static: - super().block_opt(*opt_kwargs) + pass def a_qdq(self, act, module, aquantizer): if self.act_static: @@ -30,11 +29,11 @@ def a_qdq(self, act, module, aquantizer): args['zeros'] = ( module.buf_act_zeros if hasattr(module, 'buf_act_zeros') else None ) - args['qmax'] = ( - module.buf_act_qmax if hasattr(module, 'buf_act_qmax') else None + args['max_int'] = ( + module.buf_act_max_int if hasattr(module, 'buf_act_max_int') else None ) - args['qmin'] = ( - module.buf_act_qmin if hasattr(module, 'buf_act_qmin') else None + args['min_int'] = ( + module.buf_act_min_int if hasattr(module, 'buf_act_min_int') else None ) return aquantizer.fake_quant_act_static(act, args) else: @@ -60,18 +59,18 @@ def get_act_qparams(self, layers_dict, act_tensors): avg_max_val = max_val / len(act_tensors) else: avg_max_val += max_val / len(act_tensors) - scales, zeros, qmax, qmin = self.aquantizer.get_qparams( + scales, zeros, max_int, min_int = self.aquantizer.get_qparams( (avg_min_val, avg_max_val), act_tensors[0].device ) for name in layers_dict: layers_dict[name].register_buffer('buf_act_scales', scales) layers_dict[name].register_buffer('buf_act_zeros', zeros) - layers_dict[name].register_buffer('buf_act_qmax', qmax) - layers_dict[name].register_buffer('buf_act_qmin', qmin) + layers_dict[name].register_buffer('buf_act_max_int', max_int) + layers_dict[name].register_buffer('buf_act_min_int', min_int) logger.info(f'{name} act_scales : {scales}') logger.info(f'{name} act_zeros : {zeros}') - logger.info(f'{name} act_qmax : {qmax}') - logger.info(f'{name} act_qmin : {qmin}') + logger.info(f'{name} act_max_int : {max_int}') + logger.info(f'{name} act_min_int : {min_int}') @torch.no_grad() def subset_transform( diff --git a/llmc/compression/quantization/smoothquant.py b/llmc/compression/quantization/smoothquant.py index 706edc80d..2ba1905cd 100644 --- a/llmc/compression/quantization/smoothquant.py +++ b/llmc/compression/quantization/smoothquant.py @@ -11,10 +11,8 @@ @ALGO_REGISTRY class SmoothQuant(BaseBlockwiseQuantization): - def __init__(self, model, quant_config, input, padding_mask, config): - super().__init__(model, quant_config, input, padding_mask, config) - special_config = self.quant_config.get('special', {}) - self.alpha = special_config.get('alpha', 0.5) + def __init__(self, model, quant_config, input, config): + super().__init__(model, quant_config, input, config) @torch.no_grad() def filter_subset(self, subset): @@ -55,7 +53,7 @@ def search_scale_subset(self, layers, tensors): w_max = self.get_weight_scale(layers) x_max = self.get_act_scale(tensors) x_max = x_max.to(dtype=w_max.dtype, device=w_max.device) - scale = (x_max.pow(self.alpha) / w_max.pow(self.alpha)).clamp(min=1e-5) + scale = (x_max.pow(0.5) / w_max.pow(0.5)).clamp(min=1e-5) return scale @torch.no_grad() diff --git a/llmc/compression/quantization/spinquant.py b/llmc/compression/quantization/spinquant.py new file mode 100644 index 000000000..82a1df238 --- /dev/null +++ b/llmc/compression/quantization/spinquant.py @@ -0,0 +1,231 @@ +import gc +from functools import partial + +import torch +import torch.nn as nn +from loguru import logger + +from llmc.utils.registry_factory import ALGO_REGISTRY + +from .base_blockwise_quantization import BaseBlockwiseQuantization +from .hadamard_utils import apply_exact_had_to_linear, random_hadamard_matrix 
+from .module_utils import * +from .module_utils import (_LLMC_LN_TYPES_, _TRANSFORMERS_LN_TYPES_, + EffcientFakeQuantLinear, FakeQuantLinear, + LlmcRMSNorm, OriginEmbedding, OriginFloatLinear, + RotateEmbedding, RotateLinear) +from .rotate_utils import ActRotater, RotateModule, WeightRotater + + +@ALGO_REGISTRY +class SpinQuant(BaseBlockwiseQuantization): + def __init__(self, model, quant_config, input, config): + super().__init__(model, quant_config, input, config) + self.dev = torch.device('cuda') + self.add_quant_config() + self.preprocess() + + def add_quant_config(self): + self.rotate_mode = self.quant_config['special']['rotate_mode'] + self.weight_rotate = True + self.w_rotater = WeightRotater(weight_rotate_func=self.rotate_weight, dev=self.dev) + # self.o_proj_group_quant = self.quant_config['special']['o_proj_group_quant'] + + def preprocess(self): + for m in self.model.model.parameters(): + m.requires_grad = False + + assert self.config['model']['type'] in ['Opt', 'Llama'] + # if self.config["model"]["type"] in ["Opt"]: + self.remove_mean_from_embed() + + Q1 = self.get_orthogonal_matrix(self.hidden_size) + self.model.model.Q1 = RotateModule(Q1) + + self.register_embed_spin_parameters() + + pre_head_ln = self.model.get_pre_head_layernorm_layers()[0] + self.fuse_ln_fcs(pre_head_ln, self.model.get_head_layers()) + + self.model.replace_module_subset( + LlmcRMSNorm, + self.model.model, + {'layers': {'model.norm': pre_head_ln}}, + None, + {}, + ) + self.register_lmhead_spin_parameters() + + gc.collect() + torch.cuda.empty_cache() + + def get_trainable_params(self): + trainable_parameters = [] + for n, m in self.model.model.named_parameters(): + if 'Q1' in n or 'Q2' in n: + trainable_parameters.append(m) + return trainable_parameters + + def a_rot(self, act, module, a_rotater): + return a_rotater.rotate(act) + + def w_rot(self, module, w_rotater, args): + return w_rotater.rotate(module.weight, module.bias, args['Q1'], args['Q2'], args['transpose']) + + def w_qdq_tmp(self, module, wquantizer): + args = {'lowbound_factor': None, 'upbound_factor': None} + if hasattr(module, 'buf_lowbound_factor'): + args['lowbound_factor'] = module.buf_lowbound_factor + if hasattr(module, 'buf_upbound_factor'): + args['upbound_factor'] = module.buf_upbound_factor + + return wquantizer.fake_quant_weight_dynamic(module.tmp_weight, args) + + def register_embed_spin_parameters(self): + embedding_layer = self.model.get_embed_layers()[0] + args = {} + args['Q1'] = self.model.model.Q1 + args['Q2'] = None + args['transpose'] = False + params_dict = self.get_replacement_params(mode='rotate', w_only=self.w_only, name=None, args=args) + params_dict.pop('a_rot') + self.model.replace_module_subset( + RotateEmbedding, + self.model.model, + {'layers': {'model.embed_tokens': embedding_layer}}, + None, + params_dict + ) + self.model.find_embed_layers() + + def register_lmhead_spin_parameters(self): + lm_head_layer = self.model.get_head_layers()[0] + args = {} + args['Q1'] = self.model.model.Q1 + args['Q2'] = None + args['transpose'] = False + params_dict = self.get_replacement_params(mode='rotate', w_only=self.w_only, name=None, args=args) + self.model.replace_module_subset( + RotateLinear, + self.model.model, + {'layers': {'lm_head': lm_head_layer}}, + None, + params_dict + ) + + def apply_fc_rotate_weight(self): + for idx, block in enumerate(self.blocks): + logger.info(f'Start apply {idx}-th block rotate weights') + for name, module in block.named_modules(): + if isinstance(module, (RotateLinear, FakeQuantLinear)): + 
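+                    # Fold the trained rotation into the stored weights so the
+                    # deployed model no longer needs the extra rotation matmul.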
weight, bias = module._rotate_weight() + module.weight, module.bias = weight, bias + logger.info(f'End apply {idx}-th block rotate weights') + + def apply_embedding_rotate_weight(self): + + embedding_layer = self.model.get_embed_layers()[0] + if isinstance(embedding_layer, RotateEmbedding): + weight = embedding_layer._rotate_weight() + embedding_layer.weight.data = weight + self.model.replace_module_subset( + OriginEmbedding, + self.model.model, + {'layers': {'model.embed_tokens': embedding_layer}}, + None, + {} + ) + + def apply_lmhead_rotate_weight(self): + lm_head_layer = self.model.get_head_layers()[0] + if isinstance(lm_head_layer, RotateLinear): + weight, bias = lm_head_layer._rotate_weight() + lm_head_layer.weight, lm_head_layer.bias = weight, bias + self.model.replace_module_subset( + OriginFloatLinear, + self.model.model, + {'layers': {'lm_head': lm_head_layer}}, + None, + {} + ) + + + def get_orthogonal_matrix(self, size): + if self.rotate_mode == 'random': + return random_orthogonal_matrix(size, self.dev) + elif self.rotate_mode == 'hadamard': + return random_hadamard_matrix(size, self.dev) + else: + raise ValueError(f'Unsupported mode {self.mode}') + + def block_transform(self, block): + logger.info(f'Start transform the {self.block_idx+1}-th block') + + subsets = self.model.get_subsets_in_block(block) + for index, subset in enumerate(subsets): + self.subset_transform(block, subset) + + self.model.replace_module_block(LlmcRMSNorm, block, self.block_idx, {}) + + logger.info(f'block:{block}') + logger.info(f'End transform the {self.block_idx+1}-th block') + + def subset_transform(self, block, subset): + prev_op = subset['prev_op'] + layers_dict = subset['layers'] + assert ( + len(prev_op) == 1 + ), 'Only support single prev_op. If multi prev_ops, code need to be updated.' 
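+        # LN/RMSNorm predecessors: fuse the norm into the following linears and
+        # rotate their inputs with the shared Q1; attention subsets additionally
+        # attach a per-head Q2 (used below for v_proj/o_proj) to the inspected module.
+        # Other predecessors fall through to the output-side (transpose=True) rotation.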
+
+        layers = list(layers_dict.values())
+
+        if isinstance(prev_op[0], tuple(_LLMC_LN_TYPES_ + _TRANSFORMERS_LN_TYPES_)):
+            self.fuse_ln_fcs(prev_op[0], layers)
+            for n in layers_dict.keys():
+                m = layers_dict[n]
+                self.replace_rotate_fc(block, n, m, Q1=self.model.model.Q1, Q2=None, transpose=False)
+            if 'is_mlp' not in subset or not subset['is_mlp']:
+                Q2 = self.get_orthogonal_matrix(self.hidden_size // self.num_heads)
+                subset['inspect'].Q2 = RotateModule(Q2)
+
+        else:
+            if self.config['model']['type'] in ['Opt']:
+                self.bake_mean_into_linear(layers[0])
+
+            n = list(layers_dict.keys())[0]
+            m = layers[0]
+            if 'is_mlp' in subset and subset['is_mlp']:
+                if self.online_rotate:
+                    apply_exact_had_to_linear(m, had_dim=-1, output=False)
+                self.replace_rotate_fc(block, n, m, Q1=self.model.model.Q1, Q2=None, transpose=True)
+            else:
+                self.replace_rotate_fc(block, n, m, Q1=self.model.model.Q1, Q2=block.self_attn.Q2, transpose=True)
+                self.replace_rotate_fc(block, 'self_attn.v_proj', prev_op[0], Q1=self.model.model.Q1, Q2=block.self_attn.Q2, transpose=False)
+
+    def apply_rotate_weight(self):
+        self.apply_embedding_rotate_weight()
+        self.apply_lmhead_rotate_weight()
+        self.apply_fc_rotate_weight()
+
+    def deploy(self, quant_format):
+        if quant_format == 'train_rotate_quant':
+            logger.info(f'-- deploy_{quant_format}_model start --')
+            logger.info(f'quant_config : {self.quant_config}')
+            logger.info(self.model.model)
+
+            params_dict = {}
+            params_dict['w_qdq'] = partial(self.w_qdq_tmp, wquantizer=self.wquantizer)
+            params_dict['a_qdq'] = (
+                partial(self.a_qdq, aquantizer=self.aquantizer)
+                if not self.w_only
+                else None
+            )
+            self.model.replace_module_all(
+                FakeQuantLinear, params_dict
+            )
+
+            logger.info(f'-- deploy_{quant_format}_model done --')
+            logger.info('-- start training rotation --')
+        else:
+            self.apply_rotate_weight()
+            super().deploy(quant_format)
diff --git a/llmc/compression/quantization/spqr.py b/llmc/compression/quantization/spqr.py
index 51ee90742..559385669 100644
--- a/llmc/compression/quantization/spqr.py
+++ b/llmc/compression/quantization/spqr.py
@@ -13,13 +13,13 @@
 from .base_blockwise_quantization import BaseBlockwiseQuantization
 from .module_utils import FakeQuantLinear
-from .quant import IntegerQuantizer
+from .quant import Quantizer
 
 
 @ALGO_REGISTRY
 class SpQR(BaseBlockwiseQuantization):
-    def __init__(self, model, quant_config, input, padding_mask, config):
-        super().__init__(model, quant_config, input, padding_mask, config)
+    def __init__(self, model, quant_config, input, config):
+        super().__init__(model, quant_config, input, config)
         assert (
             self.wquantizer.granularity == 'per_group'
         ), 'SpQR only supports per_group quantization'
@@ -50,11 +50,9 @@ def add_quant_config(self):
         scale_config = special_config['scale']
         zero_config = special_config['zero']
 
-        self.quant_type = self.quant_config.get('quant_type', 'int_quant')
-        assert self.quant_type != 'float_quant', 'SPQR do not support Float quant now.'
- self.scale_quantizer = IntegerQuantizer(**scale_config) - self.zero_quantizer = IntegerQuantizer(**zero_config) - self.Q = IntegerQuantizer( + self.scale_quantizer = Quantizer(**scale_config) + self.zero_quantizer = Quantizer(**zero_config) + self.Q = Quantizer( self.wquantizer.bit, self.wquantizer.sym, 'per_channel', round_zp=False ) @@ -232,8 +230,8 @@ def outliers(G, HinvGD): W[:, i].unsqueeze(1), self.qparams['scales'], self.qparams['zeros'], - self.qparams['qmax'], - self.qparams['qmin'], + self.qparams['max_int'], + self.qparams['min_int'], ).squeeze(1) err = (W[:, i] - q) / Hinv[i, i] @@ -323,24 +321,24 @@ def merge_qparams(self, qparams): def get_group_qparams(self, c_tensor, idx): """get qparams for a group, idx is the index of a column within a group, c_tensor is a group.""" - _, s, z, qmax, qmin = self.wquantizer.get_tensor_qparams(c_tensor) + _, s, z, max_int, min_int = self.wquantizer.get_tensor_qparams(c_tensor) _, ss, zs, Ps, Ns = self.scale_quantizer.get_tensor_qparams(s) args = {} args['scales'] = ss args['zeros'] = zs - args['qmin'] = Ns - args['qmax'] = Ps + args['min_int'] = Ns + args['max_int'] = Ps scales = self.scale_quantizer.fake_quant_weight_static(s.data, args) _, sz, zz, Pz, Nz = self.zero_quantizer.get_tensor_qparams(z) args['scales'] = sz args['zeros'] = zz - args['qmin'] = Nz - args['qmax'] = Pz + args['min_int'] = Nz + args['max_int'] = Pz zeros = self.zero_quantizer.fake_quant_weight_static(z.data, args) self.qparams['scales'] = scales self.qparams['zeros'] = zeros - self.qparams['qmax'] = qmax - self.qparams['qmin'] = qmin + self.qparams['max_int'] = max_int + self.qparams['min_int'] = min_int qparams = copy.deepcopy(self.qparams) self.groups[idx // self.wquantizer.group_size] = qparams @@ -351,8 +349,8 @@ def set_model_qparams(self, layer): d['zeros'] = self.merge_qparams([g['zeros'] for g in self.groups]) for k, v in d.items(): layer.register_buffer('buf_' + k, copy.deepcopy(v)) - layer.register_buffer('buf_qmax', torch.tensor(self.groups[0]['qmax'])) - layer.register_buffer('buf_qmin', torch.tensor(self.groups[0]['qmin'])) + layer.register_buffer('buf_max_int', torch.tensor(self.groups[0]['max_int'])) + layer.register_buffer('buf_min_int', torch.tensor(self.groups[0]['min_int'])) @torch.no_grad() def free(self, name): @@ -375,8 +373,8 @@ def w_qdq(self, module, wquantizer): args = {} args['scales'] = module.buf_scales args['zeros'] = module.buf_zeros - args['qmax'] = module.buf_qmax - args['qmin'] = module.buf_qmin + args['max_int'] = module.buf_max_int + args['min_int'] = module.buf_min_int weight = wquantizer.fake_quant_weight_static(weight, args).to(self.model_dtype) diff --git a/llmc/compression/quantization/train_utils.py b/llmc/compression/quantization/train_utils.py index 1f941f1a8..2c69733d5 100644 --- a/llmc/compression/quantization/train_utils.py +++ b/llmc/compression/quantization/train_utils.py @@ -1,11 +1,15 @@ import os +import random import sys import time +from dataclasses import dataclass, field from math import inf import torch import torch.nn as nn +import transformers from loguru import logger +from torch.optim.optimizer import Optimizer class TruncateFunction(torch.autograd.Function): @@ -105,3 +109,186 @@ def ampscaler_get_grad_norm(self, parameters, norm_type=2.0): norm_type, ) return total_norm + + +def unit(v, dim: int = 1, eps: float = 1e-8): + vnorm = norm(v, dim) + return v / vnorm.add(eps), vnorm + + +def norm(v, dim: int = 1): + assert len(v.size()) == 2 + return v.norm(p=2, dim=dim, keepdim=True) + + +def 
matrix_norm_one(W): + out = torch.abs(W) + out = torch.sum(out, dim=0) + out = torch.max(out) + return out + + +def Cayley_loop(X, W, tan_vec, t): # + [n, p] = X.size() + Y = X + t * tan_vec + for i in range(5): + Y = X + t * torch.matmul(W, 0.5 * (X + Y)) + + return Y.t() + + +def qr_retraction(tan_vec): # tan_vec, p-by-n, p <= n + [p, n] = tan_vec.size() + tan_vec.t_() + q, r = torch.linalg.qr(tan_vec) + d = torch.diag(r, 0) + ph = d.sign() + q *= ph.expand_as(q) + q.t_() + + return q + + +class SGDG(Optimizer): + r"""This optimizer updates variables with two different routines + based on the boolean variable 'stiefel'. + + If stiefel is True, the variables will be updated by SGD-G proposed + as decorrelated weight matrix. + + If stiefel is False, the variables will be updated by SGD. + This routine was taken from https://github.com/pytorch/pytorch/blob/master/torch/optim/sgd.py. + + Args: + params (iterable): iterable of parameters to optimize or dicts defining + parameter groups + + -- common parameters + lr (float): learning rate + momentum (float, optional): momentum factor (default: 0) + stiefel (bool, optional): whether to use SGD-G (default: False) + + -- parameters in case stiefel is False + weight_decay (float, optional): weight decay (L2 penalty) (default: 0) + dampening (float, optional): dampening for momentum (default: 0) + nesterov (bool, optional): enables Nesterov momentum (default: False) + + -- parameters in case stiefel is True + omega (float, optional): orthogonality regularization factor (default: 0) + grad_clip (float, optional): threshold for gradient norm clipping (default: None) + """ + + def __init__( + self, + params, + lr, + momentum: int = 0, + dampening: int = 0, + weight_decay: int = 0, + nesterov: bool = False, + stiefel: bool = False, + omega: int = 0, + grad_clip=None, + ) -> None: + defaults = dict( + lr=lr, + momentum=momentum, + dampening=dampening, + weight_decay=weight_decay, + nesterov=nesterov, + stiefel=stiefel, + omega=0, + grad_clip=grad_clip, + ) + if nesterov and (momentum <= 0 or dampening != 0): + raise ValueError('Nesterov momentum requires a momentum and zero dampening') + super(SGDG, self).__init__(params, defaults) + + def __setstate__(self, state) -> None: + super(SGDG, self).__setstate__(state) + for group in self.param_groups: + group.setdefault('nesterov', False) + + def step(self, closure=None, episilon = 1e-8): + """Performs a single optimization step. + + Arguments: + closure (callable, optional): A closure that reevaluates the model + and returns the loss. 
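+
+        For Stiefel parameters, the gradient is combined with the momentum buffer,
+        projected to a skew-symmetric matrix W, and the update is mapped back to
+        the manifold with an iterative Cayley transform (Cayley_loop), plus an
+        occasional QR retraction of the running iterate.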
+ """ + loss = None + if closure is not None: + loss = closure() + + for group in self.param_groups: + momentum = group['momentum'] + stiefel = group['stiefel'] + + for p in group['params']: + if p.grad is None: + continue + + unity, _ = unit(p.data.view(p.size()[0], -1)) + if stiefel and unity.size()[0] <= unity.size()[1]: + weight_decay = group['weight_decay'] + dampening = group['dampening'] + nesterov = group['nesterov'] + + rand_num = random.randint(1, 101) + if rand_num == 1: + unity = qr_retraction(unity) + + g = p.grad.data.view(p.size()[0], -1) + + lr = group['lr'] + + param_state = self.state[p] + if 'momentum_buffer' not in param_state: + param_state['momentum_buffer'] = torch.zeros(g.t().size()) + if p.is_cuda: + param_state['momentum_buffer'] = param_state[ + 'momentum_buffer' + ].cuda() + + V = param_state['momentum_buffer'] + V = momentum * V - g.t() + MX = torch.mm(V, unity) + XMX = torch.mm(unity, MX) + XXMX = torch.mm(unity.t(), XMX) + W_hat = MX - 0.5 * XXMX + W = W_hat - W_hat.t() + t = 0.5 * 2 / (matrix_norm_one(W) + episilon) + alpha = min(t, lr) + + p_new = Cayley_loop(unity.t(), W, V, alpha) + V_new = torch.mm(W, unity.t()) # n-by-p + # check_identity(p_new.t()) + p.data.copy_(p_new.view(p.size())) + V.copy_(V_new) + + else: + d_p = p.grad.data + # defined. + try: + if weight_decay != 0: + # defined. + d_p.add_(weight_decay, p.data) + except: + pass + if momentum != 0: + param_state = self.state[p] + if 'momentum_buffer' not in param_state: + buf = param_state['momentum_buffer'] = d_p.clone() + else: + buf = param_state['momentum_buffer'] + # always defined. + buf.mul_(momentum).add_(1 - dampening, d_p) + # defined. + if nesterov: + d_p = d_p.add(momentum, buf) + else: + d_p = buf + + p.data.add_(-group['lr'], d_p) + + return loss diff --git a/llmc/compression/quantization/utils.py b/llmc/compression/quantization/utils.py index b7975249d..a85f20382 100644 --- a/llmc/compression/quantization/utils.py +++ b/llmc/compression/quantization/utils.py @@ -54,22 +54,3 @@ def check_w_only( ] return quantizer_mix_bits_this_layer['w_only_mix_bits'] return default_w_only - - -def make_divisible(c, divisor): - return (c + divisor - 1) // divisor - - -def calculate_zeros_width(in_features, group_size=128, pack_num=8): - if group_size >= 128: - size_multiplier = 1 - elif group_size == 64: - size_multiplier = 2 - elif group_size == 32: - size_multiplier = 4 - else: - raise NotImplementedError - - base_width = make_divisible(in_features // group_size, pack_num) - base_width = make_divisible(base_width, size_multiplier) * size_multiplier - return base_width diff --git a/llmc/compression/sparsification/base_blockwise_sparsification.py b/llmc/compression/sparsification/base_blockwise_sparsification.py index e62f5f272..607e244fa 100644 --- a/llmc/compression/sparsification/base_blockwise_sparsification.py +++ b/llmc/compression/sparsification/base_blockwise_sparsification.py @@ -12,8 +12,8 @@ class BaseBlockwiseSparsification(BlockwiseOpt): - def __init__(self, model, sparsity_config, input, padding_mask, config): - super().__init__(model, sparsity_config, input, padding_mask, config) + def __init__(self, model, sparsity_config, input, config): + super().__init__(model, sparsity_config, input, config) self.set_sparsity_config() def block_init(self, block): diff --git a/llmc/compression/sparsification/magnitude.py b/llmc/compression/sparsification/magnitude.py index 8f36b295d..57ad23a8b 100644 --- a/llmc/compression/sparsification/magnitude.py +++ 
b/llmc/compression/sparsification/magnitude.py @@ -8,8 +8,8 @@ @ALGO_REGISTRY class Magnitude(BaseBlockwiseSparsification): - def __init__(self, model, sparsity_config, input, padding_mask, config): - super().__init__(model, sparsity_config, input, padding_mask, config) + def __init__(self, model, sparsity_config, input, config): + super().__init__(model, sparsity_config, input, config) @torch.no_grad() def subset_transform( diff --git a/llmc/compression/sparsification/shortgpt.py b/llmc/compression/sparsification/shortgpt.py index c8c8dc410..64aadd9f9 100644 --- a/llmc/compression/sparsification/shortgpt.py +++ b/llmc/compression/sparsification/shortgpt.py @@ -17,8 +17,8 @@ @ALGO_REGISTRY class ShortGPT(BaseBlockwiseSparsification): - def __init__(self, model, sparsity_config, input, padding_mask, config): - super().__init__(model, sparsity_config, input, padding_mask, config) + def __init__(self, model, sparsity_config, input, config): + super().__init__(model, sparsity_config, input, config) def block_opt(self, block): block = block.cuda() diff --git a/llmc/compression/sparsification/wanda.py b/llmc/compression/sparsification/wanda.py index 951e58dab..1cdbc1e76 100644 --- a/llmc/compression/sparsification/wanda.py +++ b/llmc/compression/sparsification/wanda.py @@ -9,12 +9,12 @@ @ALGO_REGISTRY class Wanda(BaseBlockwiseSparsification): - def __init__(self, model, sparsity_config, input, padding_mask, config): - super().__init__(model, sparsity_config, input, padding_mask, config) + def __init__(self, model, sparsity_config, input, config): + super().__init__(model, sparsity_config, input, config) @torch.no_grad() def get_row_scale(self, layer, act): - if len(act.shape) == 2: + if len(act) == 2: act = act.unsqueeze(0) nsamples = act.shape[0] if isinstance(layer, nn.Linear): diff --git a/llmc/data/__init__.py b/llmc/data/__init__.py index 12ec02b05..fd0e40018 100644 --- a/llmc/data/__init__.py +++ b/llmc/data/__init__.py @@ -1,2 +1,2 @@ -from .dataset import BaseDataset +from .dataset import BaseDataset, TrainJsonDataset from .tokenizer import BaseTokenizer diff --git a/llmc/data/dataset/__init__.py b/llmc/data/dataset/__init__.py index b1933afee..bb5057b94 100644 --- a/llmc/data/dataset/__init__.py +++ b/llmc/data/dataset/__init__.py @@ -1 +1,2 @@ from .base_dataset import BaseDataset +from .train_dataset import TrainJsonDataset diff --git a/llmc/data/dataset/base_dataset.py b/llmc/data/dataset/base_dataset.py index 5e48c6ec4..8cfb36a15 100644 --- a/llmc/data/dataset/base_dataset.py +++ b/llmc/data/dataset/base_dataset.py @@ -1,34 +1,25 @@ -import json -import os from abc import ABCMeta import torch from datasets import load_dataset, load_from_disk from loguru import logger -from PIL import Image -from torch.nn import functional as F from .specified_preproc import PREPROC_REGISTRY class BaseDataset(metaclass=ABCMeta): - def __init__(self, tokenizer, calib_cfg, processor=None): + def __init__(self, tokenizer, calib_cfg): # calib_cfg logger.info(f'calib_cfg : {calib_cfg}') self.tokenizer = tokenizer - self.processor = processor self.calib_dataset_name = calib_cfg['name'] - self.calib_dataset_type = calib_cfg.get('type', 'txt') - self.padding = calib_cfg.get('padding', False) self.download = calib_cfg['download'] self.load_from_txt = calib_cfg.get('load_from_txt', False) self.calib_dataset_path = calib_cfg.get('path', None) self.n_samples = calib_cfg['n_samples'] self.calib_bs = calib_cfg['bs'] - self.seq_len = calib_cfg.get('seq_len', None) + self.seq_len = calib_cfg['seq_len'] self.preproc 
= calib_cfg['preproc'] - if self.preproc == 'original_txt': - assert self.seq_len is None self.seed = calib_cfg['seed'] self.dataset_key = { 'pileval': 'text', @@ -41,220 +32,61 @@ def __init__(self, tokenizer, calib_cfg, processor=None): self.build_calib_dataset() def build_calib_dataset(self): - if self.calib_dataset_type == 'txt': - if self.download: - if self.calib_dataset_name == 'pileval': - self.calib_dataset = load_dataset( - 'mit-han-lab/pile-val-backup', split='validation' - ) - elif self.calib_dataset_name == 'c4': - self.calib_dataset = load_dataset( - 'allenai/c4', - data_files={'train': 'en/c4-train.00000-of-01024.json.gz'}, - split='train', - ) - elif self.calib_dataset_name == 'wikitext2': - self.calib_dataset = load_dataset( - 'wikitext', 'wikitext-2-raw-v1', split='train' - ) - elif self.calib_dataset_name == 'ptb': - self.calib_dataset = load_dataset( - 'ptb_text_only', 'penn_treebank', split='train' - ) - else: - raise Exception(f'Not support {self.calib_dataset_name} dataset.') + if self.download: + if self.calib_dataset_name == 'pileval': + self.calib_dataset = load_dataset( + 'mit-han-lab/pile-val-backup', split='validation' + ) + elif self.calib_dataset_name == 'c4': + self.calib_dataset = load_dataset( + 'allenai/c4', + data_files={'train': 'en/c4-train.00000-of-01024.json.gz'}, + split='train', + ) + elif self.calib_dataset_name == 'wikitext2': + self.calib_dataset = load_dataset( + 'wikitext', 'wikitext-2-raw-v1', split='train' + ) + elif self.calib_dataset_name == 'ptb': + self.calib_dataset = load_dataset( + 'ptb_text_only', 'penn_treebank', split='train' + ) else: - if not self.load_from_txt: - # Need to pre-download the dataset. - self.calib_dataset = load_from_disk(self.calib_dataset_path) - else: - """Load dataset from your custom txt file. - - Each line in the txt file represents one input text data. 
- """ - assert self.calib_dataset_path.endswith('.txt') - logger.info(f'calib_dataset_path: {self.calib_dataset_path}') - with open(self.calib_dataset_path, 'r') as fp: - lines = fp.readlines() - self.calib_dataset = [] - for line in lines: - self.calib_dataset.append(line.strip()) - elif self.calib_dataset_type == 'img_txt': - self.calib_dataset = [] - logger.info(f'calib_dataset_path: {self.calib_dataset_path}') - for root, _, files in os.walk(self.calib_dataset_path): - for name in files: - if name.endswith('.jpg') or name.endswith('.png'): - img_path = os.path.join(root, name) - qa_path = os.path.join(root, name.split('.')[0] + '.json') - try: - with open(qa_path, 'r') as json_file: - data = json.load(json_file) - for qa in data: - question = qa['question'] - gt_answer = qa['answer'] - prompt = ( - f'USER: \n{question}\nASSISTANT: {gt_answer}' - ) - raw_image = Image.open(img_path) - self.calib_dataset.append((prompt, raw_image)) - except FileNotFoundError: - logger.warning(f'QA file not found for image: {img_path}') - except Exception as e: - logger.error( - f'Error processing image {img_path} and' - f'QA file {qa_path}: {e}' - ) - elif self.calib_dataset_type == 'img': - self.calib_dataset = [] - logger.info(f'calib_dataset_path: {self.calib_dataset_path}') - for root, _, files in os.walk(self.calib_dataset_path): - for name in files: - if name.endswith(('.jpg', '.png', '.JPEG')): - img_path = os.path.join(root, name) - raw_image = Image.open(img_path).convert('RGB') - self.calib_dataset.append(raw_image) - if len(self.calib_dataset) == self.n_samples: - return + raise Exception(f'Not support {self.calib_dataset_name} dataset.') else: - raise ValueError(f'Unsupported data type: {self.calib_dataset_type}') + if not self.load_from_txt: + # Need to pre-download the dataset. + self.calib_dataset = load_from_disk(self.calib_dataset_path) + else: + """Load dataset from your custom txt file. + + Each line in the txt file represents one input text data. 
+ """ + assert self.calib_dataset_path.endswith('.txt') + logger.info(f'calib_dataset_path: {self.calib_dataset_path}') + with open(self.calib_dataset_path, 'r') as fp: + lines = fp.readlines() + self.calib_dataset = [] + for line in lines: + self.calib_dataset.append(line.strip()) def get_calib_samples(self): if self.preproc == 'general': samples = self.general_preproc( self.calib_dataset, self.tokenizer, self.n_samples, self.seq_len ) - elif self.preproc.startswith(('vlm_', 'img_')): - preproc = PREPROC_REGISTRY[self.preproc] - samples = preproc(self.calib_dataset, self.processor, self.n_samples) else: preproc = PREPROC_REGISTRY[self.preproc] samples = preproc( - self.calib_dataset, self.tokenizer, - self.n_samples, self.seq_len + self.calib_dataset, self.tokenizer, self.n_samples, self.seq_len ) return samples - def txt_group_samples_with_mask(self, samples): - calib_samples = [] - input_ids = [] - attention_mask = [] - pad_token_id = self.tokenizer.convert_tokens_to_ids(self.tokenizer.eos_token) - if self.calib_bs < 0: - samples_len = [sample.shape[-1] for sample in samples] - max_len = max(samples_len) - samples_tmp = [] - attention_mask_tmp = [] - for sample in samples: - samples_tmp.append( - F.pad(sample, [0, max_len - sample.shape[-1]], value=pad_token_id) - ) - attention_mask_tmp.append( - F.pad( - torch.ones(1, sample.shape[-1], dtype=torch.int64), - [0, max_len - sample.shape[-1]], - value=0 - ) - ) - batch_input_ids = torch.cat(samples_tmp, dim=0) - batch_attention_mask = torch.cat(attention_mask_tmp, dim=0) - calib_samples.append( - {'input_ids': batch_input_ids, 'attention_mask': batch_attention_mask} - ) - elif self.calib_bs == 1: - input_ids = samples - attention_mask = [torch.ones(1, sample.shape[-1], dtype=torch.int64) for sample in samples] # noqa - for i in range(len(samples)): - calib_samples.append( - {'input_ids': input_ids[i], 'attention_mask': attention_mask[i]} - ) - elif self.calib_bs > 1: - for i in range(0, len(samples), self.calib_bs): - start = i - end = min(i + self.calib_bs, len(samples)) - batch_samples = samples[start:end] - batch_samples_len = [sample.shape[-1] for sample in batch_samples] - batch_max_len = max(batch_samples_len) - samples_tmp = [] - attention_mask_tmp = [] - for sample in batch_samples: - samples_tmp.append( - F.pad( - sample, - [0, batch_max_len - sample.shape[-1]], - value=pad_token_id - ) - ) - attention_mask_tmp.append( - F.pad( - torch.ones(1, sample.shape[-1], dtype=torch.int64), - [0, batch_max_len - sample.shape[-1]], - value=0 - ) - ) - batch_input_ids = torch.cat(samples_tmp, dim=0) - batch_attention_mask = torch.cat(attention_mask_tmp, dim=0) - calib_samples.append( - { - 'input_ids': batch_input_ids, - 'attention_mask': batch_attention_mask - } - ) - return calib_samples - - def txt_group_samples_wo_mask(self, samples): # without mask + def get_calib_dataset(self): + samples = self.get_calib_samples() calib_samples = [] if self.calib_bs < 0: batch = torch.cat(samples, dim=0) - calib_samples.append({'input_ids': batch}) - elif self.calib_bs == 1: - for i in range(len(samples)): - calib_samples.append({'input_ids': samples[i]}) - elif self.calib_bs > 1: - for i in range(0, len(samples), self.calib_bs): - start = i - end = min(i + self.calib_bs, len(samples)) - batch = samples[start:end] - batch = torch.cat(batch, dim=0) - calib_samples.append({'input_ids': batch}) - return calib_samples - - def img_txt_group_samples_wo_mask(self, samples): # without mask - calib_samples = [] - if self.calib_bs < 0: - batch = self.processor( - 
text=samples['prompts'], - images=samples['raw_images'], - return_tensors='pt', - padding=True - ) - calib_samples.append(batch) - elif self.calib_bs == 1: - for prompt, raw_image in zip(samples['prompts'], samples['raw_images']): - batch = self.processor( - text=prompt, - images=raw_image, - return_tensors='pt' - ) - calib_samples.append(batch) - elif self.calib_bs > 1: - for i in range(0, len(samples['prompts']), self.calib_bs): - start = i - end = min(i + self.calib_bs, len(samples['prompts'])) - batch = self.processor( - text=samples['prompts'][start:end], - images=samples['raw_images'][start:end], - return_tensors='pt', - padding=True - ) - calib_samples.append(batch) - return calib_samples - - def img_group_samples_wo_mask(self, samples): # without mask - calib_samples = [] - if self.calib_bs < 0: - batch = {'pixel_values': torch.cat([sample['pixel_values'] - for sample in samples], dim=0)} calib_samples.append(batch) elif self.calib_bs == 1: calib_samples = samples @@ -263,45 +95,10 @@ def img_group_samples_wo_mask(self, samples): # without mask start = i end = min(i + self.calib_bs, len(samples)) batch = samples[start:end] - batch = {'pixel_values': torch.cat([sample['pixel_values'] - for sample in batch], dim=0)} + batch = torch.cat(batch, dim=0) calib_samples.append(batch) - return calib_samples - - def get_calib_dataset(self): - samples = self.get_calib_samples() - if self.calib_dataset_type in ['txt', 'img']: - logger.info(f'len(samples) all : {len(samples)}') - assert len(samples) % int(os.environ['WORLD_SIZE']) == 0 - samples = samples[int(os.environ['RANK'])::int(os.environ['WORLD_SIZE'])] - logger.info(f'len(samples) rank : {len(samples)}') - elif self.calib_dataset_type == 'img_txt': - samples_len = len(samples['prompts']) - logger.info(f'len(samples) all : {samples_len}') - assert samples_len % int(os.environ['WORLD_SIZE']) == 0 - rank = int(os.environ['RANK']) - world_size = int(os.environ['WORLD_SIZE']) - samples = { - 'prompts': samples['prompts'][rank::world_size], - 'raw_images': samples['raw_images'][rank::world_size] - } - logger.info(f'len(samples) rank : {samples_len}') - calib_samples = [] - if self.calib_dataset_type == 'txt': - if self.padding: - calib_samples = self.txt_group_samples_with_mask(samples) - else: - calib_samples = self.txt_group_samples_wo_mask(samples) - elif self.calib_dataset_type == 'img': - calib_samples = self.img_group_samples_wo_mask(samples) - elif self.calib_dataset_type == 'img_txt': - calib_samples = self.img_txt_group_samples_wo_mask(samples) logger.info(f'len(calib_samples) : {len(calib_samples)}') - if self.padding: - padding_mask = [calib_sample['attention_mask'] for calib_sample in calib_samples] # noqa - else: - padding_mask = None - return calib_samples, padding_mask + return calib_samples def general_preproc(self, calib_dataset, tokenizer, n_samples, seq_len): dataset = calib_dataset.shuffle(seed=self.seed) diff --git a/llmc/data/dataset/specified_preproc.py b/llmc/data/dataset/specified_preproc.py index ab7372a09..acbf51a76 100644 --- a/llmc/data/dataset/specified_preproc.py +++ b/llmc/data/dataset/specified_preproc.py @@ -96,81 +96,7 @@ def pileval_omni(calib_dataset, tokenizer, n_samples, seq_len): j = i + seq_len inp = trainenc.input_ids[:, i:j] samples.append(inp) - return samples - - -@PREPROC_REGISTRY -def vlm_native(calib_dataset, processor, n_samples): - random.shuffle(calib_dataset) - samples = { - 'prompts': [], - 'raw_images': [] - } - n_run = 0 - for data in calib_dataset: - prompt, raw_image = data - 
samples['prompts'].append(prompt) - samples['raw_images'].append(raw_image) - n_run += 1 - if n_run == n_samples: - break - return samples - - -@PREPROC_REGISTRY -def vlm_divide_equal(calib_dataset, processor, n_samples): - samples_native = vlm_native(calib_dataset, processor, n_samples) - inputs = processor('\n\n'.join(samples_native['prompts']), return_tensors='pt') - samples = { - 'prompts': [], - 'raw_images': [] - } - total_len = inputs.input_ids.shape[1] - seq_len = total_len // n_samples - for i in range(n_samples): - s = i * seq_len - e = (i + 1) * seq_len - token_ids = inputs.input_ids[:, s:e] - prompt = processor.decode(token_ids.squeeze(), skip_special_tokens=True) - prompt = prompt.replace('USER:', 'USER: ') - samples['prompts'].append(prompt) - samples['raw_images'].append(samples_native['raw_images'][i]) - return samples - - -@PREPROC_REGISTRY -def vlm_clip_min(calib_dataset, processor, n_samples): - samples_native = vlm_native(calib_dataset, processor, n_samples) - samples = { - 'prompts': [], - 'raw_images': [] - } - trainenc = [ - processor(prompt, return_tensors='pt') - for prompt in samples_native['prompts'] - ] - min_len = min(enc.input_ids.shape[1] for enc in trainenc) - for i in range(n_samples): - token_ids = trainenc[i].input_ids[:, :min_len] - prompt = processor.decode(token_ids.squeeze(), skip_special_tokens=True) - prompt = prompt.replace('USER:', 'USER: ') - samples['prompts'].append(prompt) - samples['raw_images'].append(samples_native['raw_images'][i]) - return samples - - -@PREPROC_REGISTRY -def img_sampler(calib_dataset, processor, n_samples): - random.shuffle(calib_dataset) - samples = [] - n_run = 0 - for image in calib_dataset: - inp = processor(images=image, return_tensors='pt') - samples.append(inp) - n_run += 1 - if n_run == n_samples: - break - return samples + return samples, None @PREPROC_REGISTRY @@ -184,15 +110,3 @@ def random_truncate_txt(calib_dataset, tokenizer, n_samples, seq_len): inp = trainenc.input_ids[:, i:j] samples.append(inp) return samples - - -@PREPROC_REGISTRY -def original_txt(calib_dataset, tokenizer, n_samples, seq_len=None): - random.shuffle(calib_dataset) - n_samples = min(n_samples, len(calib_dataset)) - samples = [] - for i in range(n_samples): - trainenc = tokenizer(calib_dataset[i], return_tensors='pt') - inp = trainenc.input_ids - samples.append(inp) - return samples diff --git a/llmc/data/dataset/train_dataset.py b/llmc/data/dataset/train_dataset.py new file mode 100644 index 000000000..a1f03cfed --- /dev/null +++ b/llmc/data/dataset/train_dataset.py @@ -0,0 +1,62 @@ +import torch +from loguru import logger + + +class TrainJsonDataset(torch.utils.data.IterableDataset): + def __init__(self, dataset, tokenizer, block_size) -> None: + raw_data = dataset + self.tokenizer = tokenizer + self.block_size = block_size + tokenized_datasets = [] + for d in raw_data: + tokenized_datasets.append(self.tokenize_function(d)) + + grouped_dataset = self.group_texts(tokenized_datasets) + self.input_ids = grouped_dataset['input_ids'] + self.labels = grouped_dataset['labels'] + self.data = [ + dict(input_ids=self.input_ids[i], labels=self.labels[i]) + for i in range(len(self.input_ids)) + ] + + def __len__(self): + return len(self.data) + + def __getitem__(self, i): + return dict(input_ids=self.input_ids[i], labels=self.labels[i]) + + def __iter__(self): + return iter(self.data) + + def tokenize_function(self, examples): + return self.tokenizer(examples['text']) + + def group_texts(self, examples): + # Concatenate all texts. 
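+        # For example, with block_size=4, tokenized samples [1, 2, 3] and [4, 5, 6, 7, 8, 9] are concatenated to [1, 2, 3, 4, 5, 6, 7, 8, 9] and split into the chunks [1, 2, 3, 4] and [5, 6, 7, 8]; the leftover token [9] is dropped.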
+ # Initialize an empty dictionary + concatenated_examples = {} + + # Loop through the list of dictionaries + for d in examples: + # Loop through the keys in each dictionary + for key in d.keys(): + # If the key is not already a key in the dict_of_lists, create a new list + if key not in concatenated_examples: + concatenated_examples[key] = [] + # Append the value to the list associated with the key in dict_of_lists + concatenated_examples[key].extend(d[key]) + total_length = len(concatenated_examples['input_ids']) + # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can + # customize this part to your needs. + if total_length >= self.block_size: + total_length = (total_length // self.block_size) * self.block_size + # Split by chunks of max_len. + result = { + k: [ + t[i : i + self.block_size] + for i in range(0, total_length, self.block_size) + ] + for k, t in concatenated_examples.items() + } + result['labels'] = result['input_ids'].copy() + return result diff --git a/llmc/data/tokenizer/base_tokenizer.py b/llmc/data/tokenizer/base_tokenizer.py index f179fdbb7..c7d7429d2 100644 --- a/llmc/data/tokenizer/base_tokenizer.py +++ b/llmc/data/tokenizer/base_tokenizer.py @@ -1,4 +1,3 @@ -import warnings from abc import ABCMeta from transformers import AutoTokenizer @@ -18,13 +17,9 @@ def __str__(self): return str(self.tokenizer) def build_tokenizer(self): - try: - self.tokenizer = AutoTokenizer.from_pretrained( - self.tokenizer_path, use_fast=self.use_fast, trust_remote_code=True - ) - except Exception as e: - self.tokenizer = None - warnings.warn(f'Failed to load tokenizer. Error: {str(e)}') + self.tokenizer = AutoTokenizer.from_pretrained( + self.tokenizer_path, use_fast=self.use_fast, trust_remote_code=True + ) def get_tokenizer(self): return self.tokenizer diff --git a/llmc/eval/__init__.py b/llmc/eval/__init__.py index 7fd4c3b60..88563ee5b 100644 --- a/llmc/eval/__init__.py +++ b/llmc/eval/__init__.py @@ -1,3 +1 @@ -from .eval_acc import AccuracyEval from .eval_ppl import PerplexityEval -from .eval_token_consist import TokenConsistencyEval diff --git a/llmc/eval/eval_ppl.py b/llmc/eval/eval_ppl.py index 9b07e5b45..c925acdb8 100644 --- a/llmc/eval/eval_ppl.py +++ b/llmc/eval/eval_ppl.py @@ -6,13 +6,98 @@ from datasets import load_dataset, load_from_disk from loguru import logger -from .eval_base import BaseEval +class PerplexityEval: + def __init__(self, tokenizer, eval_cfg): + self.tokenizer = tokenizer + # eval_cfg + logger.info(f'eval_cfg : {eval_cfg}') + self.dataset = eval_cfg['name'] + assert self.dataset in [ + 'wikitext2', + 'c4', + 'ptb', + ], 'Ppl eval only support wikitext2, c4, ptb dataset now.' + self.seq_len = eval_cfg['seq_len'] + self.bs = eval_cfg['bs'] + self.path = eval_cfg.get('path', None) + self.download = eval_cfg['download'] + self.inference_per_block = eval_cfg.get('inference_per_block', False) + self.testenc = self.build_data() -class PerplexityEval(BaseEval): + @torch.no_grad() + def build_data(self): + # load data + if self.download: + if self.dataset == 'wikitext2': + testdata = load_dataset('wikitext', 'wikitext-2-raw-v1', split='test') + elif self.dataset == 'c4': + testdata = load_dataset( + 'allenai/c4', + data_files={ + 'validation': 'en/c4-validation.00000-of-00008.json.gz' + }, + split='validation', + ) + elif self.dataset == 'ptb': + testdata = load_dataset('ptb_text_only', 'penn_treebank', split='test') + else: + assert self.path, 'Please set path in eval_cfg.' 
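+                # 'path' must point to a dataset previously saved locally with datasets' save_to_disk().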
+ testdata = load_from_disk(self.path) + + # encode data + if self.dataset == 'wikitext2': + testenc = self.tokenizer('\n\n'.join(testdata['text']), return_tensors='pt') + elif self.dataset == 'c4': + testenc = self.tokenizer( + ' '.join(testdata[:1100]['text']), return_tensors='pt' + ) + testenc.input_ids = testenc.input_ids[:, : (256 * self.seq_len)] + elif self.dataset == 'ptb': + testenc = self.tokenizer( + ' '.join(testdata['sentence']), return_tensors='pt' + ) + return testenc + + @torch.no_grad() + def eval(self, model_llmc): + model = model_llmc.get_model() + if self.inference_per_block: + handles = [] + for layer in model_llmc.get_blocks(): + handles.append(layer.register_forward_pre_hook(self.forward_pre_hook)) + for layer in model_llmc.get_blocks(): + handles.append(layer.register_forward_hook(self.forward_hook)) + for layer in model_llmc.get_layers_except_blocks(): + layer.cuda() + else: + model.cuda() + + model.eval() + ppl = self.eval_ppl_func(model, self.testenc, self.seq_len, self.bs) + if self.inference_per_block: + for h in handles: + h.remove() + model.cpu() + gc.collect() + torch.cuda.empty_cache() + return ppl + + @torch.no_grad() + def forward_pre_hook(self, m, x): + m.cuda() + + @torch.no_grad() + def forward_hook(self, m, x, y): + with ThreadPoolExecutor() as executor: + executor.submit(self.load_layer_to_cpu, m) + + @torch.no_grad() + def load_layer_to_cpu(self, m): + m.cpu() @torch.no_grad() - def eval_func(self, org_model, model, testenc, seq_len, bs): + def eval_ppl_func(self, model, testenc, seq_len, bs): testenc = testenc.input_ids nsamples = testenc.numel() // seq_len @@ -73,7 +158,7 @@ def eval_func(self, org_model, model, testenc, seq_len, bs): parser.add_argument('--model_path', type=str, required=True) args = parser.parse_args() - tokenizer = BaseTokenizer(args.model_path, tokenizer_mode='fast') + tokenizer = BaseTokenizer(args.model_path) model = MODEL_REGISTRY[args.model_type](args.model_path, 'auto') # Llama2-70B config example diff --git a/llmc/eval/eval_token.py b/llmc/eval/eval_token.py new file mode 100644 index 000000000..21ead8cc7 --- /dev/null +++ b/llmc/eval/eval_token.py @@ -0,0 +1,185 @@ +import gc +from concurrent.futures import ThreadPoolExecutor + +import torch +import torch.nn as nn +from datasets import load_dataset, load_from_disk +from loguru import logger + + +class TokenConsistencyEval: + def __init__(self, tokenizer, eval_cfg): + self.tokenizer = tokenizer + # eval_cfg + logger.info(f'eval_cfg : {eval_cfg}') + self.dataset = eval_cfg['name'] + assert self.dataset in [ + 'wikitext2', + 'c4', + 'ptb', + ], 'Token consistency eval only supports wikitext2, c4, ptb datasets now.' + self.seq_len = eval_cfg['seq_len'] + self.bs = eval_cfg['bs'] + self.path = eval_cfg.get('path', None) + self.download = eval_cfg['download'] + self.inference_per_block = eval_cfg.get('inference_per_block', False) + self.testenc = self.build_data() + + @torch.no_grad() + def build_data(self): + # load data + if self.download: + if self.dataset == 'wikitext2': + testdata = load_dataset('wikitext', 'wikitext-2-raw-v1', split='test') + elif self.dataset == 'c4': + testdata = load_dataset( + 'allenai/c4', + data_files={ + 'validation': 'en/c4-validation.00000-of-00008.json.gz' + }, + split='validation', + ) + elif self.dataset == 'ptb': + testdata = load_dataset('ptb_text_only', 'penn_treebank', split='test') + else: + assert self.path, 'Please set path in eval_cfg.' 
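+                # As in PerplexityEval, 'path' must point to a dataset saved locally with datasets' save_to_disk().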
+ testdata = load_from_disk(self.path) + + # encode data + if self.dataset == 'wikitext2': + testenc = self.tokenizer('\n\n'.join(testdata['text']), return_tensors='pt') + elif self.dataset == 'c4': + testenc = self.tokenizer( + ' '.join(testdata[:1100]['text']), return_tensors='pt' + ) + testenc.input_ids = testenc.input_ids[:, : (256 * self.seq_len)] + elif self.dataset == 'ptb': + testenc = self.tokenizer( + ' '.join(testdata['sentence']), return_tensors='pt' + ) + return testenc + + @torch.no_grad() + def eval(self, model_llmc_1, model_llmc_2): + model1 = model_llmc_1.get_model() + model2 = model_llmc_2.get_model() + + if self.inference_per_block: + handles1 = [] + handles2 = [] + for layer in model_llmc_1.get_blocks(): + handles1.append(layer.register_forward_pre_hook(self.forward_pre_hook)) + handles1.append(layer.register_forward_hook(self.forward_hook)) + for layer in model_llmc_2.get_blocks(): + handles2.append(layer.register_forward_pre_hook(self.forward_pre_hook)) + handles2.append(layer.register_forward_hook(self.forward_hook)) + for layer in model_llmc_1.get_layers_except_blocks(): + layer.cuda() + for layer in model_llmc_2.get_layers_except_blocks(): + layer.cuda() + else: + model1.cuda() + model2.cuda() + + model1.eval() + model2.eval() + + consistency = self.eval_token_consistency(model1, model2, self.testenc, self.seq_len, self.bs) + + if self.inference_per_block: + for h in handles1 + handles2: + h.remove() + + model1.cpu() + model2.cpu() + gc.collect() + torch.cuda.empty_cache() + return consistency + + @torch.no_grad() + def forward_pre_hook(self, m, x): + m.cuda() + + @torch.no_grad() + def forward_hook(self, m, x, y): + with ThreadPoolExecutor() as executor: + executor.submit(self.load_layer_to_cpu, m) + + @torch.no_grad() + def load_layer_to_cpu(self, m): + m.cpu() + + @torch.no_grad() + def eval_token_consistency(self, model1, model2, testenc, seq_len, bs): + testenc = testenc.input_ids + nsamples = testenc.numel() // seq_len + + consistent_tokens = 0 + total_tokens = 0 + + # Loop through each batch + for i in range(0, nsamples, bs): + logger.info(f'index : {(i + 1) // bs}/{nsamples // bs}') + # Calculate end index + j = min(i + bs, nsamples) + + # Prepare inputs and move to gpu + inputs = testenc[:, (i * seq_len): (j * seq_len)].cuda() + inputs = inputs.reshape(j - i, seq_len) + + # Forward pass through the models + logits1 = model1(inputs).logits + logits2 = model2(inputs).logits + + # Get predicted tokens + preds1 = torch.argmax(logits1, dim=-1) + preds2 = torch.argmax(logits2, dim=-1) + + # Compare tokens for consistency + consistent_tokens += (preds1 == preds2).sum().item() + total_tokens += preds1.numel() + + # Calculate consistency ratio + consistency_ratio = consistent_tokens / total_tokens + + # Empty CUDA cache to save memory + testenc.cpu() + torch.cuda.empty_cache() + + return consistency_ratio + + +if __name__ == '__main__': + import sys + + sys.path.append('../../') + import argparse + + from llmc.data import BaseTokenizer + from llmc.models import Llama + from llmc.utils.registry_factory import MODEL_REGISTRY + + parser = argparse.ArgumentParser() + parser.add_argument('--model_type_1', type=str, required=True) + parser.add_argument('--model_path_1', type=str, required=True) + parser.add_argument('--model_type_2', type=str, required=True) + parser.add_argument('--model_path_2', type=str, required=True) + args = parser.parse_args() + + tokenizer = BaseTokenizer(args.model_path_1, tokenizer_mode='slow') + model1 = 
MODEL_REGISTRY[args.model_type_1](args.model_path_1, 'auto') + model2 = MODEL_REGISTRY[args.model_type_2](args.model_path_2, 'auto') + + # Llama2-70B config example + eval_cfg = { + 'name': 'wikitext2', + 'seq_len': 2048, + 'bs': 1, + 'download': False, + 'path': '/home/gushiqiao/nvme/gushiqiao/llm_datasets/eval/wikitext2', + 'inference_per_block': False, + } + token_consistency_eval = TokenConsistencyEval(tokenizer.get_tokenizer(), eval_cfg) + + consistency_ratio = token_consistency_eval.eval(model1, model2) + logger.info(f'Token consistency ratio: {consistency_ratio}') diff --git a/llmc/models/__init__.py b/llmc/models/__init__.py index 9e498f1e3..0fbf1d53f 100644 --- a/llmc/models/__init__.py +++ b/llmc/models/__init__.py @@ -1,22 +1,11 @@ from .bloom import Bloom -from .deepseekv2 import DeepseekV2 from .falcon import Falcon from .gemma2 import Gemma2 from .internlm2 import InternLM2 -from .internomni import InternOmni -from .internvl2 import InternVL2 from .llama import Llama from .llava import Llava -from .minicpm import MiniCPM from .mistral import Mistral from .mixtral import Mixtral from .opt import Opt -from .phi import Phi -from .qwen import Qwen from .qwen2 import Qwen2 -from .qwen2moe import Qwen2Moe -from .qwenvl import QwenVL -from .smollm import SmolLM -from .stablelm import StableLm from .starcoder import Starcoder -from .vit import Vit diff --git a/llmc/models/base_model.py b/llmc/models/base_model.py index 1695043a9..b3a34be42 100644 --- a/llmc/models/base_model.py +++ b/llmc/models/base_model.py @@ -17,19 +17,14 @@ class BaseModel(metaclass=ABCMeta): - def __init__(self, model_path, torch_dtype, device_map=None, use_cache=False): + def __init__(self, model_path, torch_dtype): self.model_path = model_path self.torch_dtype = torch_dtype if torch_dtype == 'auto' else eval(torch_dtype) - self.device_map = device_map - self.use_cache = use_cache - self.vlm_model = None - self.processor = None self.build_model() self.model.eval() self.find_blocks() self.find_embed_layers() self.find_block_name() - self.add_layernorms_class() @abstractmethod def find_blocks(self): @@ -60,17 +55,10 @@ def get_layers_except_blocks(self): def get_subsets_in_block(self, block): pass - @abstractmethod - def skip_layer_name(self): - pass - @abstractmethod def has_bias(self): pass - def get_attention_rotary_layers(self): - return [] - def __str__(self): return f'\nConfig: \n{str(self.model_config)} \nModel: \n{str(self.model)}' @@ -78,34 +66,19 @@ def build_model(self): self.model_config = AutoConfig.from_pretrained( self.model_path, trust_remote_code=True ) - if not self.use_cache: - if hasattr(self.model_config, 'use_cache'): - self.model_config.use_cache = False + if hasattr(self.model_config, 'use_cache'): + self.model_config.use_cache = False logger.info(f'self.model_config : {self.model_config}') self.model = AutoModelForCausalLM.from_pretrained( self.model_path, config=self.model_config, - device_map=self.device_map, trust_remote_code=True, torch_dtype=self.torch_dtype, low_cpu_mem_usage=True, ) - def add_layernorms_class(self): - ln_class_list = [] - single_block = self.get_blocks()[0] - ln_dict = self.get_layernorms_in_block(single_block) - for ln_name in ln_dict: - ln_class = ln_dict[ln_name].__class__ - if ln_class not in ln_class_list: - ln_class_list.append(ln_class) - for ln_class in ln_class_list: - if ln_class not in _TRANSFORMERS_LN_TYPES_: - _TRANSFORMERS_LN_TYPES_.append(ln_class) - logger.info(f'_TRANSFORMERS_LN_TYPES_ : {_TRANSFORMERS_LN_TYPES_}') - @torch.no_grad() - def 
collect_first_block_input(self, calib_data, data_type='txt'): + def collect_first_block_input(self, calib_data): first_block_input = defaultdict(list) class Catcher(nn.Module): @@ -122,38 +95,15 @@ def forward(self, inp, **kwargs): raise ValueError self.move_embed_to_device('cuda') - if data_type == 'img_txt': - self.vision_tower = self.vision_tower.to('cuda') - self.multi_modal_projector = self.multi_modal_projector.to('cuda') self.blocks[0] = self.blocks[0].cuda() self.blocks[0] = Catcher(self.blocks[0]) for data in calib_data: try: - if data_type == 'txt': - data = { - k: v.to(next(self.model.parameters()).device) - for k, v in data.items() - } - self.model(**data) - elif data_type == 'img': - data = { - k: v.to(next(self.model.parameters()).device) - for k, v in data.items() - } - self.model(**data) - elif data_type == 'img_txt': - data = { - k: v.to(next(self.model.parameters()).device) - for k, v in data.items() - } - self.vlm_model.generate(**data, max_new_tokens=200, do_sample=False) + self.model(data.to(next(self.model.parameters()).device)) except ValueError: pass self.first_block_input = first_block_input - if data_type == 'img_txt': - self.vision_tower = self.vision_tower.cpu() - self.multi_modal_projector = self.multi_modal_projector.cpu() self.blocks[0] = self.blocks[0].module self.blocks[0] = self.blocks[0].cpu() self.move_embed_to_device('cpu') @@ -166,9 +116,7 @@ def get_model_config(self): def move_embed_to_device(self, device): for embed_layer in self.get_embed_layers(): - embed_layer.to(device) - for attention_rotary_layer in self.get_attention_rotary_layers(): - attention_rotary_layer.to(device) + embed_layer = embed_layer.to(device) def get_block_linears(self, block): return { @@ -177,9 +125,6 @@ def get_block_linears(self, block): if isinstance(m, tuple(_LLMC_LINEAR_TYPES_ + _TRANSFORMERS_LINEAR_TYPES_)) } - def get_extra_modules(self, block): - return {} - def set_mix_bits_params_dict(self, block_idx, name, params_dict): logger.info('set_mix_bits_params_dict') @@ -241,16 +186,13 @@ def set_mix_bits_params_dict(self, block_idx, name, params_dict): params_mix_dict['a_qdq'] = None return params_mix_dict - def replace_module_all(self, module, params_dict, keep_device=False): + def replace_module_all(self, module, params_dict): for block_idx in range(len(self.blocks)): logger.info(f'Replace block index: {block_idx}/{len(self.blocks)}') block = self.blocks[block_idx] - if keep_device: - self.replace_module_block(module, block, block_idx, params_dict) - else: - block = block.cuda() - self.replace_module_block(module, block, block_idx, params_dict) - block = block.cpu() + block = block.cuda() + self.replace_module_block(module, block, block_idx, params_dict) + block = block.cpu() gc.collect() torch.cuda.empty_cache() diff --git a/llmc/models/bloom.py b/llmc/models/bloom.py index 16980a87c..34b0e9eae 100644 --- a/llmc/models/bloom.py +++ b/llmc/models/bloom.py @@ -5,8 +5,8 @@ @MODEL_REGISTRY class Bloom(BaseModel): - def __init__(self, model_path, torch_dtype, device_map=None, use_cache=False): - super().__init__(model_path, torch_dtype, device_map, use_cache) + def __init__(self, model_path, torch_dtype): + super().__init__(model_path, torch_dtype) def find_blocks(self): self.blocks = self.model.transformer.h @@ -31,9 +31,6 @@ def get_layers_except_blocks(self): self.model.transformer.ln_f, ] - def skip_layer_name(self): - return ['lm_head'] - def has_bias(self): return True diff --git a/llmc/models/falcon.py b/llmc/models/falcon.py index 90be4f2c2..8c9cef614 100644 --- 
a/llmc/models/falcon.py +++ b/llmc/models/falcon.py @@ -5,15 +5,14 @@ @MODEL_REGISTRY class Falcon(BaseModel): - def __init__(self, model_path, torch_dtype, device_map=None, use_cache=False): - super().__init__(model_path, torch_dtype, device_map, use_cache) + def __init__(self, model_path, torch_dtype): + super().__init__(model_path, torch_dtype) def find_blocks(self): self.blocks = self.model.transformer.h def find_embed_layers(self): self.word_embeddings = self.model.transformer.word_embeddings - self.rotary_emb = self.model.model.rotary_emb def find_block_name(self): self.block_name_prefix = 'model.transformer.h' @@ -21,11 +20,8 @@ def find_block_name(self): def get_embed_layers(self): return [self.word_embeddings] - def get_attention_rotary_layers(self): - return [self.rotary_emb] - def get_layers_except_blocks(self): - return [self.word_embeddings, self.rotary_emb, self.model.transformer.ln_f] + return [self.word_embeddings, self.model.transformer.ln_f] def has_bias(self): return False diff --git a/llmc/models/gemma2.py b/llmc/models/gemma2.py index 402a66153..b4696f921 100644 --- a/llmc/models/gemma2.py +++ b/llmc/models/gemma2.py @@ -1,34 +1,12 @@ -from loguru import logger - from llmc.utils.registry_factory import MODEL_REGISTRY -try: - from transformers.models.gemma2.modeling_gemma2 import Gemma2RMSNorm -except Exception: - logger.warning('Gemma2 not found') -from types import MethodType - -import torch.nn as nn - from .base_model import BaseModel -def gemma2_rms_norm_forward(self, x): - output = self._norm(x.float()) - output = output * self.weight.float() - return output.type_as(x) - - @MODEL_REGISTRY class Gemma2(BaseModel): - def __init__(self, model_path, torch_dtype, device_map=None, use_cache=False): - super().__init__(model_path, torch_dtype, device_map, use_cache) - for m in self.model.modules(): - if isinstance(m, Gemma2RMSNorm): - w = m.weight.data - del m.weight - m.weight = nn.Parameter(w + 1.0) - m.forward = MethodType(gemma2_rms_norm_forward, m) + def __init__(self, model_path, torch_dtype): + super().__init__(model_path, torch_dtype) def find_blocks(self): self.blocks = self.model.model.layers @@ -43,18 +21,9 @@ def find_block_name(self): def get_embed_layers(self): return [self.embed_tokens] - def get_head_layers(self): - return [self.model.lm_head] - - def get_pre_head_layernorm_layers(self): - return [self.model.model.norm] - def get_layers_except_blocks(self): return [self.embed_tokens, self.model.model.norm, self.model.lm_head] - def skip_layer_name(self): - return ['lm_head'] - def has_bias(self): return False @@ -93,7 +62,6 @@ def get_subsets_in_block(self, block): 'input': ['mlp.gate_proj'], 'inspect': block.mlp, 'has_kwargs': False, - 'is_mlp': True, }, { 'layers': {'mlp.down_proj': block.mlp.down_proj}, @@ -101,6 +69,5 @@ def get_subsets_in_block(self, block): 'input': ['mlp.down_proj'], 'inspect': block.mlp.down_proj, 'has_kwargs': False, - 'is_mlp': True, }, ] diff --git a/llmc/models/internlm2.py b/llmc/models/internlm2.py index 39f1b57eb..5e17c0d90 100644 --- a/llmc/models/internlm2.py +++ b/llmc/models/internlm2.py @@ -1,4 +1,3 @@ -from llmc.compression.quantization.module_utils import _TRANSFORMERS_LN_TYPES_ from llmc.utils.registry_factory import MODEL_REGISTRY from .base_model import BaseModel @@ -6,10 +5,8 @@ @MODEL_REGISTRY class InternLM2(BaseModel): - def __init__(self, model_path, torch_dtype, device_map=None, use_cache=False): - super().__init__(model_path, torch_dtype, device_map, use_cache) - global _TRANSFORMERS_LN_TYPES_ - 
_TRANSFORMERS_LN_TYPES_ += [type(self.model.model.norm)] + def __init__(self, model_path, torch_dtype): + super().__init__(model_path, torch_dtype) def find_blocks(self): self.blocks = self.model.model.layers @@ -23,18 +20,9 @@ def find_block_name(self): def get_embed_layers(self): return [self.tok_embeddings] - def get_head_layers(self): - return [self.model.output] - - def get_pre_head_layernorm_layers(self): - return [self.model.model.norm] - def get_layers_except_blocks(self): return [self.tok_embeddings, self.model.model.norm, self.model.output] - def skip_layer_name(self): - return ['lm_head'] - def has_bias(self): return False @@ -69,7 +57,6 @@ def get_subsets_in_block(self, block): 'input': ['feed_forward.w1'], 'inspect': block.feed_forward, 'has_kwargs': False, - 'is_mlp': True, }, { 'layers': {'feed_forward.w2': block.feed_forward.w2}, @@ -77,6 +64,5 @@ def get_subsets_in_block(self, block): 'input': ['feed_forward.w2'], 'inspect': block.feed_forward.w2, 'has_kwargs': False, - 'is_mlp': True, }, ] diff --git a/llmc/models/llama.py b/llmc/models/llama.py index 35e62bc86..10da9a38e 100644 --- a/llmc/models/llama.py +++ b/llmc/models/llama.py @@ -5,25 +5,21 @@ @MODEL_REGISTRY class Llama(BaseModel): - def __init__(self, model_path, torch_dtype, device_map=None, use_cache=False): - super().__init__(model_path, torch_dtype, device_map, use_cache) + def __init__(self, model_path, torch_dtype): + super().__init__(model_path, torch_dtype) def find_blocks(self): self.blocks = self.model.model.layers def find_embed_layers(self): self.embed_tokens = self.model.model.embed_tokens - self.rotary_emb = self.model.model.rotary_emb def find_block_name(self): self.block_name_prefix = 'model.layers' self.pairs = {'q_proj': 'qkv', 'o_proj': 'out', 'up_proj': 'fc1'} def get_embed_layers(self): - return [self.embed_tokens] - - def get_attention_rotary_layers(self): - return [self.rotary_emb] + return [self.model.model.embed_tokens] def get_head_layers(self): return [self.model.lm_head] @@ -32,10 +28,7 @@ def get_pre_head_layernorm_layers(self): return [self.model.model.norm] def get_layers_except_blocks(self): - return [self.embed_tokens, self.rotary_emb, self.model.model.norm, self.model.lm_head] # noqa - - def skip_layer_name(self): - return ['lm_head'] + return [self.embed_tokens, self.model.model.norm, self.model.lm_head] def has_bias(self): return False diff --git a/llmc/models/llava.py b/llmc/models/llava.py index e6c31ea6f..bc230241b 100644 --- a/llmc/models/llava.py +++ b/llmc/models/llava.py @@ -6,7 +6,7 @@ from .llama import Llama try: - from transformers import AutoProcessor, LlavaForConditionalGeneration + from transformers import LlavaForConditionalGeneration except Exception: logger.info( 'LlavaForConditionalGeneration is not supported in this version of transfomers.' 
@@ -16,24 +16,18 @@ @MODEL_REGISTRY class Llava(Llama): - def __init__(self, model_path, torch_dtype, device_map=None, use_cache=False): - super().__init__(model_path, torch_dtype, device_map, use_cache) + def __init__(self, model_path, torch_dtype): + super().__init__(model_path, torch_dtype) def build_model(self): - self.vlm_model_config = AutoConfig.from_pretrained( + self.model_config = AutoConfig.from_pretrained( self.model_path, trust_remote_code=True ) - if not self.use_cache: - self.vlm_model_config.text_config.use_cache = False - logger.info(f'self.vlm_model_config : {self.vlm_model_config}') - self.vlm_model = LlavaForConditionalGeneration.from_pretrained( + self.model_config.text_config.use_cache = False + self.llava_model = LlavaForConditionalGeneration.from_pretrained( self.model_path, - config=self.vlm_model_config, + config=self.model_config, torch_dtype=self.torch_dtype, low_cpu_mem_usage=True, ) - self.vision_tower = self.vlm_model.vision_tower - self.multi_modal_projector = self.vlm_model.multi_modal_projector - self.processor = AutoProcessor.from_pretrained(self.model_path) - self.model = self.vlm_model.language_model - self.model_config = self.vlm_model_config.text_config + self.model = self.llava_model.language_model diff --git a/llmc/models/mistral.py b/llmc/models/mistral.py index 7689b4092..be18a0b4e 100644 --- a/llmc/models/mistral.py +++ b/llmc/models/mistral.py @@ -5,8 +5,8 @@ @MODEL_REGISTRY class Mistral(BaseModel): - def __init__(self, model_path, torch_dtype, device_map=None, use_cache=False): - super().__init__(model_path, torch_dtype, device_map, use_cache) + def __init__(self, model_path, torch_dtype): + super().__init__(model_path, torch_dtype) def find_blocks(self): self.blocks = self.model.model.layers @@ -24,9 +24,6 @@ def get_embed_layers(self): def get_layers_except_blocks(self): return [self.embed_tokens, self.model.model.norm, self.model.lm_head] - def skip_layer_name(self): - return ['lm_head'] - def has_bias(self): return False diff --git a/llmc/models/mixtral.py b/llmc/models/mixtral.py index fec0fcdb5..a94583101 100644 --- a/llmc/models/mixtral.py +++ b/llmc/models/mixtral.py @@ -5,8 +5,8 @@ @MODEL_REGISTRY class Mixtral(BaseModel): - def __init__(self, model_path, torch_dtype, device_map=None, use_cache=False): - super().__init__(model_path, torch_dtype, device_map, use_cache) + def __init__(self, model_path, torch_dtype): + super().__init__(model_path, torch_dtype) def find_blocks(self): self.blocks = self.model.model.layers @@ -23,9 +23,6 @@ def get_embed_layers(self): def get_layers_except_blocks(self): return [self.embed_tokens, self.model.model.norm, self.model.lm_head] - def skip_layer_name(self): - return ['lm_head'] - def has_bias(self): return False @@ -35,11 +32,6 @@ def get_layernorms_in_block(self, block): 'post_attention_layernorm': block.post_attention_layernorm, } - def get_extra_modules(self, block): - return { - 'block_sparse_moe': block.block_sparse_moe - } - def get_subsets_in_block(self, block): return [ { @@ -61,25 +53,11 @@ def get_subsets_in_block(self, block): 'has_kwargs': False, }, { - 'layers': { - **{f'block_sparse_moe.experts.{i}.w1': block.block_sparse_moe.experts[i].w1 for i in range(len(block.block_sparse_moe.experts))}, # noqa - **{f'block_sparse_moe.experts.{i}.w3': block.block_sparse_moe.experts[i].w3 for i in range(len(block.block_sparse_moe.experts))}, # noqa - }, + 'layers': {'block_sparse_moe.gate': block.block_sparse_moe.gate}, 'prev_op': [block.post_attention_layernorm], - 'input': ['block_sparse_moe'], 
- 'inspect': block.block_sparse_moe, + 'input': ['block_sparse_moe.gate'], + 'inspect': block.block_sparse_moe.gate, 'has_kwargs': False, - 'is_mlp': True, }, - *[ - { - 'layers': {f'block_sparse_moe.experts.{i}.w2': block.block_sparse_moe.experts[i].w2}, # noqa - 'prev_op': [block.block_sparse_moe.experts[i].w3], - 'input': [f'block_sparse_moe.experts.{i}.w2'], - 'inspect': block.block_sparse_moe.experts[i].w2, - 'has_kwargs': False, - 'is_mlp': True, - } - for i in range(len(block.block_sparse_moe.experts)) - ], + # Moe layers can not transform. ] diff --git a/llmc/models/opt.py b/llmc/models/opt.py index 71e2f2114..95ac30aed 100644 --- a/llmc/models/opt.py +++ b/llmc/models/opt.py @@ -5,8 +5,8 @@ @MODEL_REGISTRY class Opt(BaseModel): - def __init__(self, model_path, torch_dtype, device_map=None, use_cache=False): - super().__init__(model_path, torch_dtype, device_map, use_cache) + def __init__(self, model_path, torch_dtype): + super().__init__(model_path, torch_dtype) def find_blocks(self): self.blocks = self.model.model.decoder.layers @@ -38,9 +38,6 @@ def get_layers_except_blocks(self): layers.append(self.model.model.decoder.final_layer_norm) return layers - def skip_layer_name(self): - return ['lm_head'] - def has_bias(self): return True diff --git a/llmc/models/qwen2.py b/llmc/models/qwen2.py index 25840decd..d260fcfa2 100644 --- a/llmc/models/qwen2.py +++ b/llmc/models/qwen2.py @@ -5,15 +5,14 @@ @MODEL_REGISTRY class Qwen2(BaseModel): - def __init__(self, model_path, torch_dtype, device_map=None, use_cache=False): - super().__init__(model_path, torch_dtype, device_map, use_cache) + def __init__(self, model_path, torch_dtype): + super().__init__(model_path, torch_dtype) def find_blocks(self): self.blocks = self.model.model.layers def find_embed_layers(self): self.embed_tokens = self.model.model.embed_tokens - self.rotary_emb = self.model.model.rotary_emb def find_block_name(self): self.block_name_prefix = 'model.layers' @@ -22,20 +21,8 @@ def find_block_name(self): def get_embed_layers(self): return [self.embed_tokens] - def get_attention_rotary_layers(self): - return [self.rotary_emb] - - def get_head_layers(self): - return [self.model.lm_head] - - def get_pre_head_layernorm_layers(self): - return [self.model.model.norm] - def get_layers_except_blocks(self): - return [self.embed_tokens, self.rotary_emb, self.model.model.norm, self.model.lm_head] # noqa - - def skip_layer_name(self): - return ['lm_head'] + return [self.embed_tokens, self.model.model.norm, self.model.lm_head] def has_bias(self): return False @@ -75,7 +62,6 @@ def get_subsets_in_block(self, block): 'input': ['mlp.gate_proj'], 'inspect': block.mlp, 'has_kwargs': False, - 'is_mlp': True, }, { 'layers': {'mlp.down_proj': block.mlp.down_proj}, @@ -83,6 +69,5 @@ def get_subsets_in_block(self, block): 'input': ['mlp.down_proj'], 'inspect': block.mlp.down_proj, 'has_kwargs': False, - 'is_mlp': True, }, ] diff --git a/llmc/models/starcoder.py b/llmc/models/starcoder.py index 0a97f9b63..be4d8bc30 100644 --- a/llmc/models/starcoder.py +++ b/llmc/models/starcoder.py @@ -5,8 +5,8 @@ @MODEL_REGISTRY class Starcoder(BaseModel): - def __init__(self, model_path, torch_dtype, device_map=None, use_cache=False): - super().__init__(model_path, torch_dtype, device_map, use_cache) + def __init__(self, model_path, torch_dtype): + super().__init__(model_path, torch_dtype) def find_blocks(self): self.blocks = self.model.transformer.h @@ -29,9 +29,6 @@ def get_layers_except_blocks(self): self.model.lm_head, ] - def skip_layer_name(self): - 
return ['lm_head'] - def has_bias(self): return True diff --git a/llmc/utils/__init__.py b/llmc/utils/__init__.py index 574b27176..bad8b4d1d 100644 --- a/llmc/utils/__init__.py +++ b/llmc/utils/__init__.py @@ -1,4 +1 @@ -from .export_autoawq import update_autoawq_quant_config -from .export_vllm import update_vllm_quant_config -from .utils import (check_config, copy_files, mkdirs, - print_important_package_version, seed_all) +from .utils import check_config, copy_files, mkdirs, seed_all diff --git a/llmc/utils/utils.py b/llmc/utils/utils.py index 7bc1f0d5a..4c198b1ce 100644 --- a/llmc/utils/utils.py +++ b/llmc/utils/utils.py @@ -57,11 +57,6 @@ def check_weight_setting(weight_setting): config.model.tokenizer_mode = 'slow' logger.info('Tokenizer_mode is set to slow.') - if 'calib' in config and not config.calib.get('type', False): - config.calib.type = 'txt' - if 'eval' in config and not config.eval.get('type', False): - config.eval.type = 'ppl' - def mkdirs(path): if not os.path.exists(path): @@ -77,12 +72,3 @@ def copy_files(source_dir, target_dir, substring): target_file = os.path.join(target_dir, filename) shutil.copy(source_file, target_file) logger.info(f'Copied {filename} to {target_dir}') - - -def print_important_package_version(): - from importlib.metadata import version - logger.info(f"torch : {version('torch')}") - logger.info(f"transformers : {version('transformers')}") - logger.info(f"tokenizers : {version('tokenizers')}") - logger.info(f"huggingface-hub : {version('huggingface-hub')}") - logger.info(f"datasets : {version('datasets')}") diff --git a/lm-evaluation-harness b/lm-evaluation-harness deleted file mode 160000 index 86fd4ad29..000000000 --- a/lm-evaluation-harness +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 86fd4ad29b1eb168cd1c86dd37d8eb6a93ee67d2 diff --git a/requirements/runtime.txt b/requirements/runtime.txt index 89203076e..7c6153e81 100644 --- a/requirements/runtime.txt +++ b/requirements/runtime.txt @@ -1,27 +1,10 @@ -torch>=2.1.0 -torchvision -timm -pillow +torch loguru -transformers==4.45.2 +transformers==4.44.2 +accelerate==0.31.0 +datasets==2.20.0 huggingface-hub sentencepiece protobuf -accelerate>=0.26.0 zstandard easydict -evaluate>=0.4.0 -datasets>=2.16.0 -jsonlines -numexpr -peft>=0.2.0 -pybind11>=2.6.2 -pytablewriter -rouge-score>=0.0.4 -sacrebleu>=1.5.0 -scikit-learn>=0.24.1 -sqlitedict -tqdm-multiprocess -dill -word2number -more_itertools diff --git a/scripts/run_adadim_llama.sh b/scripts/run_adadim_llama.sh new file mode 100644 index 000000000..28e2a4ba0 --- /dev/null +++ b/scripts/run_adadim_llama.sh @@ -0,0 +1,15 @@ +#!/bin/bash + +gpu_id=0 +export CUDA_VISIBLE_DEVICES=$gpu_id + +llmc=llmc_path +export PYTHONPATH=$llmc:$PYTHONPATH + +task_name=llm_quant_exp + +nohup \ +python -m llmc --config ../configs/quantization/AdaDim/adadim_w8a8_fakequant_eval.yml \ +> ${task_name}.log 2>&1 & + +echo $! > ${task_name}.pid \ No newline at end of file diff --git a/scripts/run_awq_llama.sh b/scripts/run_awq_llama.sh new file mode 100644 index 000000000..3d638583d --- /dev/null +++ b/scripts/run_awq_llama.sh @@ -0,0 +1,16 @@ +#!/bin/bash + +gpu_id=0 +export CUDA_VISIBLE_DEVICES=$gpu_id + +llmc=llmc_path +export PYTHONPATH=$llmc:$PYTHONPATH + +task_name=llm_quant_exp + +nohup \ +python -m llmc --config ../configs/quantization/Awq/awq_w4a16_fakequant_eval.yml \ +> ${task_name}.log 2>&1 & + +echo $! 
> ${task_name}.pid + diff --git a/scripts/run_dgq_llama.sh b/scripts/run_dgq_llama.sh new file mode 100644 index 000000000..aa3c109be --- /dev/null +++ b/scripts/run_dgq_llama.sh @@ -0,0 +1,16 @@ +#!/bin/bash + +gpu_id=0 +export CUDA_VISIBLE_DEVICES=$gpu_id + +llmc=llmc_path +export PYTHONPATH=$llmc:$PYTHONPATH + +task_name=llm_quant_exp + +nohup \ +python -m llmc --config ../configs/quantization/DGQ/dgq_w4a8_fakequant_eval.yml \ +> ${task_name}.log 2>&1 & + +echo $! > ${task_name}.pid + diff --git a/scripts/run_gptq_llama.sh b/scripts/run_gptq_llama.sh new file mode 100644 index 000000000..05e2609d5 --- /dev/null +++ b/scripts/run_gptq_llama.sh @@ -0,0 +1,15 @@ +#!/bin/bash + +gpu_id=0 +export CUDA_VISIBLE_DEVICES=$gpu_id + +llmc=llmc_path +export PYTHONPATH=$llmc:$PYTHONPATH + +task_name=llm_quant_exp + +nohup \ +python -m llmc --config ../configs/quantization/GPTQ/gptq_quarot.yml \ +> ${task_name}.log 2>&1 & + +echo $! > ${task_name}.pid \ No newline at end of file diff --git a/scripts/run_gptq_owq_llama.sh b/scripts/run_gptq_owq_llama.sh new file mode 100644 index 000000000..7e0f6d22c --- /dev/null +++ b/scripts/run_gptq_owq_llama.sh @@ -0,0 +1,15 @@ +#!/bin/bash + +gpu_id=0 +export CUDA_VISIBLE_DEVICES=$gpu_id + +llmc=llmc_path +export PYTHONPATH=$llmc:$PYTHONPATH + +task_name=llm_quant_exp + +nohup \ +python -m llmc --config ../configs/quantization/GPTQ/gptq_owq_w4a16_fakequant_eval.yml \ +> ${task_name}.log 2>&1 & + +echo $! > ${task_name}.pid \ No newline at end of file diff --git a/scripts/run_hqq_llama.sh b/scripts/run_hqq_llama.sh new file mode 100644 index 000000000..7f995c9a0 --- /dev/null +++ b/scripts/run_hqq_llama.sh @@ -0,0 +1,15 @@ +#!/bin/bash + +gpu_id=0 +export CUDA_VISIBLE_DEVICES=$gpu_id + +llmc=llmc_path +export PYTHONPATH=$llmc:$PYTHONPATH + +task_name=llm_quant_exp + +nohup \ +python -m llmc --config ../configs/quantization/HQQ/hqq_w4a16_fakequant_eval.yml \ +> ${task_name}.log 2>&1 & + +echo $! > ${task_name}.pid \ No newline at end of file diff --git a/scripts/run_in_tmux_sequence.sh b/scripts/run_in_tmux_sequence.sh new file mode 100644 index 000000000..6534e1ae5 --- /dev/null +++ b/scripts/run_in_tmux_sequence.sh @@ -0,0 +1,25 @@ +#!/bin/bash + +gpu_id=0 +export CUDA_VISIBLE_DEVICES=$gpu_id + +llmc=llmc_path +export PYTHONPATH=$llmc:$PYTHONPATH + + +task_name=rtn_w8a8_fakequant_eval +echo "${task_name} running..." +python -m llmc --config ../configs/quantization/RTN/rtn_w8a8_fakequant_eval.yml \ +> ${task_name}.log 2>&1 + + +task_name=smoothquant_llama_w8a8_fakequant_eval_general +echo "${task_name} running..." +python -m llmc --config ../configs/quantization/SmoothQuant/smoothquant_llama_w8a8_fakequant_eval_general.yml \ +> ${task_name}.log 2>&1 + + +task_name=osplus_llama_w8a8_fakequant_eval_general +echo "${task_name} running..." +python -m llmc --config ../configs/quantization/OsPlus/osplus_llama_w8a8_fakequant_eval_general.yml \ +> ${task_name}.log 2>&1 diff --git a/scripts/run_llmint8_llama.sh b/scripts/run_llmint8_llama.sh new file mode 100644 index 000000000..a4261cb6d --- /dev/null +++ b/scripts/run_llmint8_llama.sh @@ -0,0 +1,16 @@ +#!/bin/bash + +gpu_id=0 +export CUDA_VISIBLE_DEVICES=$gpu_id + +llmc=llmc_path +export PYTHONPATH=$llmc:$PYTHONPATH + +task_name=llm_quant_exp + +nohup \ +python -m llmc --config ../configs/quantization/LlmInt8/llmint8_w8a8_fakequant_eval.yml \ +> ${task_name}.log 2>&1 & + +echo $! 
> ${task_name}.pid + diff --git a/scripts/run_ntweak_llama.sh b/scripts/run_ntweak_llama.sh new file mode 100644 index 000000000..b94e260ae --- /dev/null +++ b/scripts/run_ntweak_llama.sh @@ -0,0 +1,16 @@ +#!/bin/bash + +gpu_id=0 +export CUDA_VISIBLE_DEVICES=$gpu_id + +llmc=llmc_path +export PYTHONPATH=$llmc:$PYTHONPATH + +task_name=llm_quant_exp + +nohup \ +python -m llmc --config ../configs/quantization/NormTweaking/ntweak_llama_w4a16_fakequant_eval.yml \ +> ${task_name}.log 2>&1 & + +echo $! > ${task_name}.pid + diff --git a/scripts/run_omniq_llama.sh b/scripts/run_omniq_llama.sh new file mode 100644 index 000000000..5f7241a61 --- /dev/null +++ b/scripts/run_omniq_llama.sh @@ -0,0 +1,16 @@ +#!/bin/bash + +gpu_id=0 +export CUDA_VISIBLE_DEVICES=$gpu_id + +llmc=llmc_path +export PYTHONPATH=$llmc:$PYTHONPATH + +task_name=llm_quant_exp + +nohup \ +python -m llmc --config ../configs/quantization/OmniQuant/omniq_llama_w8a8_fakequant_eval.yml \ +> ${task_name}.log 2>&1 & + +echo $! > ${task_name}.pid + diff --git a/scripts/run_omniq_mistral.sh b/scripts/run_omniq_mistral.sh new file mode 100644 index 000000000..0164521af --- /dev/null +++ b/scripts/run_omniq_mistral.sh @@ -0,0 +1,15 @@ +#!/bin/bash + +gpu_id=0 +export CUDA_VISIBLE_DEVICES=$gpu_id + +llmc=llmc_path +export PYTHONPATH=$llmc:$PYTHONPATH + +task_name=llm_quant_exp + +nohup \ +python -m llmc --config ../configs/quantization/OmniQuant/omniq_mistral_w8a8_fakequant_eval.yml \ +> ${task_name}.log 2>&1 & + +echo $! > ${task_name}.pid diff --git a/scripts/run_omniq_opt.sh b/scripts/run_omniq_opt.sh new file mode 100644 index 000000000..2e0da4b4e --- /dev/null +++ b/scripts/run_omniq_opt.sh @@ -0,0 +1,15 @@ +#!/bin/bash + +gpu_id=0 +export CUDA_VISIBLE_DEVICES=$gpu_id + +llmc=llmc_path +export PYTHONPATH=$llmc:$PYTHONPATH + +task_name=llm_quant_exp + +nohup \ +python -m llmc --config ../configs/quantization/OmniQuant/omniq_opt_w8a8_fakequant_eval.yml \ +> ${task_name}.log 2>&1 & + +echo $! > ${task_name}.pid \ No newline at end of file diff --git a/scripts/run_osplus_llama.sh b/scripts/run_osplus_llama.sh new file mode 100644 index 000000000..983364620 --- /dev/null +++ b/scripts/run_osplus_llama.sh @@ -0,0 +1,15 @@ +#!/bin/bash + +gpu_id=0 +export CUDA_VISIBLE_DEVICES=$gpu_id + +llmc=llmc_path +export PYTHONPATH=$llmc:$PYTHONPATH + +task_name=llm_quant_exp + +nohup \ +python -m llmc --config ../configs/quantization/OsPlus/osplus_llama_w8a8_fakequant_eval_general.yml \ +> ${task_name}.log 2>&1 & + +echo $! > ${task_name}.pid diff --git a/scripts/run_osplus_opt.sh b/scripts/run_osplus_opt.sh new file mode 100644 index 000000000..37f666150 --- /dev/null +++ b/scripts/run_osplus_opt.sh @@ -0,0 +1,15 @@ +#!/bin/bash + +gpu_id=0 +export CUDA_VISIBLE_DEVICES=$gpu_id + +llmc=llmc_path +export PYTHONPATH=$llmc:$PYTHONPATH + +task_name=llm_quant_exp + +nohup \ +python -m llmc --config ../configs/quantization/OsPlus/osplus_opt_w8a8_fakequant_eval_general.yml \ +> ${task_name}.log 2>&1 & + +echo $! > ${task_name}.pid diff --git a/scripts/run_quarot_llama.sh b/scripts/run_quarot_llama.sh new file mode 100644 index 000000000..760a7c5c1 --- /dev/null +++ b/scripts/run_quarot_llama.sh @@ -0,0 +1,15 @@ +#!/bin/bash + +gpu_id=1 +export CUDA_VISIBLE_DEVICES=$gpu_id + +llmc=llmc_path +export PYTHONPATH=$llmc:$PYTHONPATH + +task_name=llm_quant_exp + +nohup \ +python -m llmc --config ../configs/quantization/QuaRot/quarot_w4a4.yml \ +> ${task_name}.log 2>&1 & + +echo $! 
> ${task_name}.pid \ No newline at end of file diff --git a/scripts/run_quik_llama.sh b/scripts/run_quik_llama.sh new file mode 100644 index 000000000..818069d8b --- /dev/null +++ b/scripts/run_quik_llama.sh @@ -0,0 +1,15 @@ +#!/bin/bash + +gpu_id=0 +export CUDA_VISIBLE_DEVICES=$gpu_id + +llmc=llmc_path +export PYTHONPATH=$llmc:$PYTHONPATH + +task_name=llm_quant_exp + +nohup \ +python -m llmc --config ../configs/quantization/QUIK/quik_w4a4_fakequant_eval.yml \ +> ${task_name}.log 2>&1 & + +echo $! > ${task_name}.pid \ No newline at end of file diff --git a/scripts/run_rtn_llama.sh b/scripts/run_rtn_llama.sh new file mode 100644 index 000000000..8d328a7f4 --- /dev/null +++ b/scripts/run_rtn_llama.sh @@ -0,0 +1,15 @@ +#!/bin/bash + +gpu_id=0 +export CUDA_VISIBLE_DEVICES=$gpu_id + +llmc=llmc_path +export PYTHONPATH=$llmc:$PYTHONPATH + +task_name=llm_quant_exp + +nohup \ +python -m llmc --config ../configs/quantization/RTN/rtn_w8a8_fakequant_eval.yml \ +> ${task_name}.log 2>&1 & + +echo $! > ${task_name}.pid \ No newline at end of file diff --git a/scripts/run_rtn_llama_static.sh b/scripts/run_rtn_llama_static.sh new file mode 100644 index 000000000..cc7e62da4 --- /dev/null +++ b/scripts/run_rtn_llama_static.sh @@ -0,0 +1,15 @@ +#!/bin/bash + +gpu_id=0 +export CUDA_VISIBLE_DEVICES=$gpu_id + +llmc=llmc_path +export PYTHONPATH=$llmc:$PYTHONPATH + +task_name=llm_quant_exp + +nohup \ +python -m llmc --config ../configs/quantization/RTN/rtn_w8a8_pertensor_static.yml \ +> ${task_name}.log 2>&1 & + +echo $! > ${task_name}.pid \ No newline at end of file diff --git a/scripts/run_shortgpt_llama.sh b/scripts/run_shortgpt_llama.sh new file mode 100644 index 000000000..f56c090ae --- /dev/null +++ b/scripts/run_shortgpt_llama.sh @@ -0,0 +1,15 @@ +#!/bin/bash + +gpu_id=0 +export CUDA_VISIBLE_DEVICES=$gpu_id + +llmc=llmc_path +export PYTHONPATH=$llmc:$PYTHONPATH + +task_name=llm_quant_exp + +nohup \ +python -m llmc --config ../configs/sparsification/ShortGPT/shortgpt.yml \ +> ${task_name}.log 2>&1 & + +echo $! > ${task_name}.pid \ No newline at end of file diff --git a/scripts/run_smoothquant_llama.sh b/scripts/run_smoothquant_llama.sh new file mode 100644 index 000000000..6715d68e0 --- /dev/null +++ b/scripts/run_smoothquant_llama.sh @@ -0,0 +1,15 @@ +#!/bin/bash + +gpu_id=0 +export CUDA_VISIBLE_DEVICES=$gpu_id + +llmc=llmc_path +export PYTHONPATH=$llmc:$PYTHONPATH + +task_name=llm_quant_exp + +nohup \ +python -m llmc --config ../configs/quantization/SmoothQuant/smoothquant_llama_w8a8_fakequant_eval_general.yml \ +> ${task_name}.log 2>&1 & + +echo $! > ${task_name}.pid diff --git a/scripts/run_smoothquant_opt.sh b/scripts/run_smoothquant_opt.sh new file mode 100644 index 000000000..38f7b616d --- /dev/null +++ b/scripts/run_smoothquant_opt.sh @@ -0,0 +1,15 @@ +#!/bin/bash + +gpu_id=0 +export CUDA_VISIBLE_DEVICES=$gpu_id + +llmc=llmc_path +export PYTHONPATH=$llmc:$PYTHONPATH + +task_name=llm_quant_exp + +nohup \ +python -m llmc --config ../configs/quantization/SmoothQuant/smoothquant_opt_w8a8_fakequant_eval.yml \ +> ${task_name}.log 2>&1 & + +echo $! 
> ${task_name}.pid
diff --git a/scripts/run_spinquant_llama.sh b/scripts/run_spinquant_llama.sh
new file mode 100644
index 000000000..240858f88
--- /dev/null
+++ b/scripts/run_spinquant_llama.sh
@@ -0,0 +1,15 @@
+#!/bin/bash
+
+gpu_id=0
+export CUDA_VISIBLE_DEVICES=$gpu_id
+
+llmc=llmc_path
+export PYTHONPATH=$llmc:$PYTHONPATH
+
+task_name=llm_quant_exp
+
+nohup \
+python -m llmc --config ../configs/quantization/SpinQuant/spinquant_w4a4.yml \
+> ${task_name}.log 2>&1 &
+
+echo $! > ${task_name}.pid
\ No newline at end of file
diff --git a/scripts/run_spqr_llama.sh b/scripts/run_spqr_llama.sh
new file mode 100644
index 000000000..270c61611
--- /dev/null
+++ b/scripts/run_spqr_llama.sh
@@ -0,0 +1,15 @@
+#!/bin/bash
+
+gpu_id=0
+export CUDA_VISIBLE_DEVICES=$gpu_id
+
+llmc=llmc_path
+export PYTHONPATH=$llmc:$PYTHONPATH
+
+task_name=llm_quant_exp
+
+nohup \
+python -m llmc --config ../configs/quantization/SpQR/spqr_w4a16_fakequant_eval.yml \
+> ${task_name}.log 2>&1 &
+
+echo $! > ${task_name}.pid
\ No newline at end of file
diff --git a/scripts/run_wanda_llama.sh b/scripts/run_wanda_llama.sh
new file mode 100644
index 000000000..96b31c518
--- /dev/null
+++ b/scripts/run_wanda_llama.sh
@@ -0,0 +1,15 @@
+#!/bin/bash
+
+gpu_id=0
+export CUDA_VISIBLE_DEVICES=$gpu_id
+
+llmc=llmc_path
+export PYTHONPATH=$llmc:$PYTHONPATH
+
+task_name=llm_quant_exp
+
+nohup \
+python -m llmc --config ../configs/sparsification/Wanda/wanda.yml \
+> ${task_name}.log 2>&1 &
+
+echo $! > ${task_name}.pid
\ No newline at end of file
diff --git a/tools/outlier_analysis.py b/tools/outlier_analysis.py
new file mode 100644
index 000000000..474eaf7ce
--- /dev/null
+++ b/tools/outlier_analysis.py
@@ -0,0 +1,483 @@
+import argparse
+import functools
+import gc
+import os
+import sys
+
+import torch
+from loguru import logger
+from tqdm import tqdm
+from transformers import AutoConfig, AutoModelForCausalLM
+
+sys.path.append('..')
+import matplotlib.pyplot as plt
+import torch.nn as nn
+
+from llmc.compression.quantization import FakeQuantLinear, Quantizer
+from llmc.compression.quantization.module_utils import (
+    _LLMC_LINEAR_TYPES_, _TRANSFORMERS_LINEAR_TYPES_, RotateLinear)
+from llmc.data import BaseDataset, BaseTokenizer
+from llmc.models import *
+from llmc.utils import check_config, mkdirs, seed_all
+from llmc.utils.registry_factory import ALGO_REGISTRY, MODEL_REGISTRY
+
+
+def calculate_kurtosis_channel(signal):
+    """Calculates the kurtosis of a given signal.
+
+    Args:
+        signal (torch.Tensor): Input signal, shape (4096, 1024).
+
+    Returns:
+        float: The average kurtosis value of the rows.
+    """
+    signal = signal.float()
+    mean = torch.mean(signal, dim=1, keepdim=True)
+    std = torch.std(signal, dim=1, keepdim=True)
+
+    std[std == 0] = 1e-8  # Avoid division by zero
+
+    standardized_signal = (signal - mean) / std
+    kurtosis = torch.mean(
+        standardized_signal**4, dim=1
+    )  # Calculate kurtosis for each row
+
+    average_kurtosis = torch.mean(kurtosis)
+
+    return average_kurtosis.item()
+
+
+def calculate_kurtosis(signal):
+    """Calculates the kurtosis of a given signal.
+
+    Args:
+        signal (torch.Tensor): Input signal, shape (N, *).
+
+    Returns:
+        float: The kurtosis value.
+ """ + signal = signal.float() + signal = signal.view(1, -1) + mean = torch.mean(signal) + std = torch.std(signal) + + if std == 0: + return float('inf') + + standardized_signal = (signal - mean) / (std + 1e-8) + + kurtosis = torch.mean(standardized_signal**4) # - 3 + + return kurtosis.item() + + +def draw(save_path, save_name, X, Y1, Y2): + fig = plt.figure() + ax = fig.add_subplot(1, 1, 1) + ax.plot(X, Y1) + ax.plot(X, Y2) + plt.xlabel('channel') + plt.ylabel('value') + plt.title(save_name) + fig.savefig(f'{save_path}/{save_name}.jpg') + plt.close(fig) + plt.cla() + + +def analysis_block_cosine(res, t_res, args): + cosine_sim = nn.CosineSimilarity() + + for name in res: + oups = res[name] + t_oups = t_res[name] + + layer_cosine_dict = {} + for j in range(oups.shape[0]): + cos = cosine_sim(oups[j].float().view(1, -1), t_oups[j].float().view(1, -1)) + + if name not in layer_cosine_dict: + layer_cosine_dict[name] = [] + + layer_cosine_dict[name].append(cos.item()) + + for name in layer_cosine_dict: + cos_values = layer_cosine_dict[name] + min_cos = min(cos_values) + avg_cos = sum(cos_values) / len(cos_values) + logger.info(name) + logger.info(f'min_cos : {min_cos}') + logger.info(f'avg_cos : {avg_cos}') + + +def avg_k_a(a, k): + result = (a[:, None] * k[None, :]).sum(dim=0) + + total_sum = result.sum() + print(result.shape) + + average = total_sum / result.numel() + return average + + +def analysis_block_outlier(res, t_res, org_w, trans_w, arg): + if args.prof_gra in ['per_channel', 'per_group']: + kurt_func = calculate_kurtosis_channel + else: + kurt_func = calculate_kurtosis + + for name in res: + logger.info(name) + + weight = org_w[name] + t_weight = trans_w[name] + + if args.prof_gra == 'per_group': + weight = wquanter.reshape_tensor(weight) + t_weight = wquanter.reshape_tensor(t_weight) + + k_w = kurt_func(weight) + k_t_w = kurt_func(t_weight) + + logger.info(f'The kurtosis of org weight is :{k_w}') + logger.info(f'The kurtosis of trans weight is :{k_t_w}') + + tensor = res[name].mean(dim=0) + tensor = tensor.float() + + t_tensor = t_res[name].mean(dim=0) + t_tensor = t_tensor.float() + + k_a = kurt_func(tensor) + k_t_a = kurt_func(t_tensor) + + logger.info(f'The kurtosis of org act is :{k_a}') + logger.info(f'The kurtosis of trans act is :{k_t_a}') + + if args.draw: + save_outlier_path = os.path.join(args.save_path, 'outlier') + save_t_outlier_path = os.path.join(args.save_path, 't_outlier') + + t_min_val = t_tensor.amin(dim=0).detach().cpu().numpy() + t_max_val = t_tensor.amax(dim=0).detach().cpu().numpy() + + min_val = tensor.amin(dim=0).detach().cpu().numpy() + max_val = tensor.amax(dim=0).detach().cpu().numpy() + + if not os.path.exists(args.save_path): + mkdirs(save_outlier_path) + mkdirs(save_t_outlier_path) + + draw( + save_path=save_outlier_path, + save_name=name, + X=range(tensor.shape[-1]), + Y1=min_val, + Y2=max_val, + ) + + draw( + save_path=save_t_outlier_path, + save_name=name, + X=range(t_tensor.shape[-1]), + Y1=t_min_val, + Y2=t_max_val, + ) + + +def register_hook(block, idx, args): + hooks = [] + for name, m in block.named_modules(): + if not args.cosine: + if isinstance(m, tuple(_LLMC_LINEAR_TYPES_ + _TRANSFORMERS_LINEAR_TYPES_)): + hooks.append( + m.register_forward_hook( + functools.partial( + stat_input_hook, + w=m.weight.data, + name=name, + idx=idx, + args=args, + ) + ) + ) + else: + if isinstance(m, tuple(_LLMC_LINEAR_TYPES_ + _TRANSFORMERS_LINEAR_TYPES_)): + hooks.append( + m.register_forward_hook( + functools.partial( + stat_output_hook, name=name, idx=idx, 
args=args + ) + ) + ) + + return hooks + + +def stat_input_hook(m, x, y, w, name, idx, args): + if isinstance(x, tuple): + x = x[0] + + layer_name = f'block_{idx}.{name}' + + if args.online_rotate and t: + if 'down_proj' in layer_name: + x = down_rotater.rotate(x) + elif 'o_proj' in layer_name: + x = o_rotater.rotate(x) + + if t: + t_res[layer_name] = x + trans_w[layer_name] = w + else: + res[layer_name] = x + org_w[layer_name] = w + + +def stat_output_hook(m, x, y, name, idx, args): + if isinstance(y, tuple): + y = y[0] + layer_name = f'block_{idx}.{name}' + if t: + t_res[layer_name] = y + else: + res[layer_name] = y + + +def block_forward(block, input_data, input_kwargs): + output = [] + + for i in range(len(input_data)): + input_data[i] = input_data[i].to( + device=next(block.parameters()).device, + dtype=next(block.parameters()).dtype, + ) + if ( + 'attention_mask' in input_kwargs[i] + and input_kwargs[i]['attention_mask'] is not None + ): + input_kwargs[i]['attention_mask'] = input_kwargs[i]['attention_mask'].cuda() + with torch.no_grad(): + out = block(input_data[i], **input_kwargs[i])[0] + output.append(out) + return output + + +class analysis_quanter(Quantizer): + def __init__(self, bit, symmetric, granularity, **kwargs): + super().__init__(bit, symmetric, granularity, **kwargs) + + def fake_quant_weight_dynamic(self, module, args={}): + weight = module.weight + if 'int_indices' in args: + if self.granularity == 'per_group': + assert len(args['int_indices']) % self.group_size == 0 + q_weight = weight[:, args['int_indices']] + fp_weight = weight[:, args['fp_indices']] + + elif 'dim' in args and 'ic' in args['dim']: + q_weight = weight.T + else: + q_weight = weight + + if 'current_bit' in args: + org_bit = self.bit + self.bit = args['current_bit'] + + org_w_shape = q_weight.shape + org_w_dtype = q_weight.dtype + q_weight, scales, zeros, max_int, min_int = self.get_tensor_qparams( + q_weight, args + ) + + q_weight = self.quant_dequant(q_weight, scales, zeros, max_int, min_int) + q_weight = self.restore_tensor(q_weight, org_w_shape).to(org_w_dtype) + + if 'current_bit' in args: + self.bit = org_bit + + if 'int_indices' in args: + mix_weight = torch.zeros_like(weight) + mix_weight[:, args['int_indices']] = q_weight + mix_weight[:, args['fp_indices']] = fp_weight + return mix_weight + + elif 'dim' in args and 'ic' in args['dim']: + q_weight = q_weight.T + + return q_weight + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('--dataset_name', type=str) + parser.add_argument('--data_path', type=str) + parser.add_argument('--n_samples', type=int, default=128) + parser.add_argument('--bs', type=int, default=-1) + parser.add_argument('--seq_len', type=int, default=512) + parser.add_argument('--seed', type=int, default=42) + parser.add_argument('--preproc', type=str, default='general') + parser.add_argument('--save_path', type=str, default='./save') + parser.add_argument('--draw', action='store_true') + parser.add_argument('--cosine', action='store_true') + parser.add_argument('--model_type', type=str, required=True) + parser.add_argument('--model_path', type=str, required=True) + parser.add_argument('--t_model_path', type=str) + parser.add_argument('--torch_dtype', type=str, default='auto') + parser.add_argument('--tokenizer_mode', type=str, default='slow') + + parser.add_argument('--w_only', action='store_true') + parser.add_argument('--wbit', type=int, default=6) + parser.add_argument('--wsym', action='store_true') + parser.add_argument('--wgra', 
type=str, default='per_channel') + parser.add_argument('--group_size', type=int, default=-1) + + parser.add_argument('--abit', type=int, default=6) + parser.add_argument('--asym', action='store_true') + parser.add_argument('--agra', type=str, default='per_token') + + parser.add_argument('--log_dir', type=str, default='log.txt') + parser.add_argument('--prof_gra', type=str, default='per_tensor') + parser.add_argument('--config_path', type=str) + + parser.add_argument('--online_rotate', action='store_true') + + args = parser.parse_args() + + seed_all(args.seed) + + logger.remove() + logger.add(args.log_dir, level='INFO', mode='w') + + logger.info(f'args : {args}') + + calib_cfg = { + 'name': args.dataset_name, + 'download': False, + 'path': args.data_path, + 'n_samples': args.n_samples, + 'bs': args.bs, + 'seq_len': args.seq_len, + 'preproc': args.preproc, + 'seed': args.seed, + } + + model_config = { + 'type': args.model_type, + 'path': args.model_path, + 'torch_dtype': args.torch_dtype, + } + + model = MODEL_REGISTRY[args.model_type](args.model_path, args.torch_dtype) + + t_model = MODEL_REGISTRY[args.model_type](args.t_model_path, args.torch_dtype) + + if args.online_rotate: + # import gc + + import yaml + from easydict import EasyDict + + with open(args.config_path, 'r') as file: + config = yaml.safe_load(file) + config = EasyDict(config) + + tokenizer = BaseTokenizer(args.model_path, args.tokenizer_mode) + dataset = BaseDataset(tokenizer.get_tokenizer(), config.calib) + calib_data = dataset.get_calib_dataset() + t_model.collect_first_block_input(calib_data) + del calib_data + gc.collect() + torch.cuda.empty_cache() + + blockwise_opt = ALGO_REGISTRY[config.quant.method]( + t_model, config.quant, t_model.get_first_block_input(), config + ) + blockwise_opt.run_block_loop() + t_model = blockwise_opt.model + + for n, m in t_model.model.named_modules(): + if isinstance(m, RotateLinear): + logger.info(m) + if 'down_proj' in n: + down_rotater = m.rotater + else: + o_rotater = m.rotater + + logger.info(t_model) + logger.info(model) + + tokenizer = BaseTokenizer(args.model_path, args.tokenizer_mode) + dataset = BaseDataset(tokenizer.get_tokenizer(), calib_cfg) + + calib_data = dataset.get_calib_dataset() + + model.collect_first_block_input(calib_data) + t_model.collect_first_block_input(calib_data) + + fp_inps = model.get_first_block_input() + t_fp_inps = t_model.get_first_block_input() + + res = {} + t_res = {} + + org_w = {} + trans_w = {} + + wquanter = analysis_quanter( + bit=args.wbit, + symmetric=args.wsym, + granularity=args.wgra, + group_size=args.group_size, + ) + + if not args.w_only: + aquanter = Quantizer(bit=args.abit, symmetric=args.asym, granularity=args.agra) + + def a_qdq(act, module=None): + return aquanter.fake_quant_act_dynamic(act) + + if args.cosine: + params_dict = {} + params_dict['w_qdq'] = wquanter.fake_quant_weight_dynamic + params_dict['a_qdq'] = None if args.w_only else a_qdq + t_model.replace_module_all(FakeQuantLinear, params_dict) + + with torch.no_grad(): + for i in tqdm(range(len(model.blocks))): + block = model.blocks[i] + t_block = t_model.blocks[i] + block.cuda() + t_block.cuda() + + t_hooks = register_hook(t_block, i, args) + t = True + t_fp_inps['data'] = block_forward( + t_block, t_fp_inps['data'], t_fp_inps['kwargs'] + ) + + hooks = register_hook(block, i, args) + t = False + fp_inps['data'] = block_forward(block, fp_inps['data'], fp_inps['kwargs']) + + block.cpu() + + t_block.cpu() + + for h in hooks: + h.remove() + + for t_h in t_hooks: + t_h.remove() + 
+ if args.cosine: + analysis_block_cosine(res, t_res, args) + else: + analysis_block_outlier(res, t_res, org_w, trans_w, args) + + res.clear() + t_res.clear() + org_w.clear() + trans_w.clear() + + gc.collect() + torch.cuda.empty_cache() diff --git a/tools/token_analysis.py b/tools/token_analysis.py new file mode 100644 index 000000000..48fd3dcd7 --- /dev/null +++ b/tools/token_analysis.py @@ -0,0 +1,185 @@ +import gc +from concurrent.futures import ThreadPoolExecutor + +import torch +import torch.nn as nn +from datasets import load_dataset, load_from_disk +from loguru import logger + + +class TokenConsistencyEval: + def __init__(self, tokenizer, eval_cfg): + self.tokenizer = tokenizer + # eval_cfg + logger.info(f'eval_cfg : {eval_cfg}') + self.dataset = eval_cfg['name'] + assert self.dataset in [ + 'wikitext2', + 'c4', + 'ptb', + ], 'Token consistency eval only supports wikitext2, c4, ptb datasets now.' + self.seq_len = eval_cfg['seq_len'] + self.bs = eval_cfg['bs'] + self.path = eval_cfg.get('path', None) + self.download = eval_cfg['download'] + self.inference_per_block = eval_cfg.get('inference_per_block', False) + self.testenc = self.build_data() + + @torch.no_grad() + def build_data(self): + # load data + if self.download: + if self.dataset == 'wikitext2': + testdata = load_dataset('wikitext', 'wikitext-2-raw-v1', split='test') + elif self.dataset == 'c4': + testdata = load_dataset( + 'allenai/c4', + data_files={ + 'validation': 'en/c4-validation.00000-of-00008.json.gz' + }, + split='validation', + ) + elif self.dataset == 'ptb': + testdata = load_dataset('ptb_text_only', 'penn_treebank', split='test') + else: + assert self.path, 'Please set path in eval_cfg.' + testdata = load_from_disk(self.path) + + # encode data + if self.dataset == 'wikitext2': + testenc = self.tokenizer('\n\n'.join(testdata['text']), return_tensors='pt') + elif self.dataset == 'c4': + testenc = self.tokenizer( + ' '.join(testdata[:1100]['text']), return_tensors='pt' + ) + testenc.input_ids = testenc.input_ids[:, : (256 * self.seq_len)] + elif self.dataset == 'ptb': + testenc = self.tokenizer( + ' '.join(testdata['sentence']), return_tensors='pt' + ) + return testenc + + @torch.no_grad() + def eval(self, model_llmc_1, model_llmc_2): + model1 = model_llmc_1.get_model() + model2 = model_llmc_2.get_model() + + if self.inference_per_block: + handles1 = [] + handles2 = [] + for layer in model_llmc_1.get_blocks(): + handles1.append(layer.register_forward_pre_hook(self.forward_pre_hook)) + handles1.append(layer.register_forward_hook(self.forward_hook)) + for layer in model_llmc_2.get_blocks(): + handles2.append(layer.register_forward_pre_hook(self.forward_pre_hook)) + handles2.append(layer.register_forward_hook(self.forward_hook)) + for layer in model_llmc_1.get_layers_except_blocks(): + layer.cuda() + for layer in model_llmc_2.get_layers_except_blocks(): + layer.cuda() + else: + model1.cuda() + model2.cuda() + + model1.eval() + model2.eval() + + consistency = self.eval_token_consistency(model1, model2, self.testenc, self.seq_len, self.bs) + + if self.inference_per_block: + for h in handles1 + handles2: + h.remove() + + model1.cpu() + model2.cpu() + gc.collect() + torch.cuda.empty_cache() + return consistency + + @torch.no_grad() + def forward_pre_hook(self, m, x): + m.cuda() + + @torch.no_grad() + def forward_hook(self, m, x, y): + with ThreadPoolExecutor() as executor: + executor.submit(self.load_layer_to_cpu, m) + + @torch.no_grad() + def load_layer_to_cpu(self, m): + m.cpu() + + @torch.no_grad() + def 
eval_token_consistency(self, model1, model2, testenc, seq_len, bs): + testenc = testenc.input_ids + nsamples = testenc.numel() // seq_len + + consistent_tokens = 0 + total_tokens = 0 + + # Loop through each batch + for i in range(0, nsamples, bs): + logger.info(f'index : {(i + 1) // bs}/{nsamples // bs}') + # Calculate end index + j = min(i + bs, nsamples) + + # Prepare inputs and move to gpu + inputs = testenc[:, (i * seq_len): (j * seq_len)].cuda() + inputs = inputs.reshape(j - i, seq_len) + + # Forward pass through the models + logits1 = model1(inputs).logits + logits2 = model2(inputs).logits + + # Get predicted tokens + preds1 = torch.argmax(logits1, dim=-1) + preds2 = torch.argmax(logits2, dim=-1) + + # Compare tokens for consistency + consistent_tokens += (preds1 == preds2).sum().item() + total_tokens += preds1.numel() + + # Calculate consistency ratio + consistency_ratio = consistent_tokens / total_tokens + + # Empty CUDA cache to save memory + testenc.cpu() + torch.cuda.empty_cache() + + return consistency_ratio + + +if __name__ == '__main__': + import sys + + sys.path.append('../../') + import argparse + + from llmc.data import BaseTokenizer + from llmc.models import Llama + from llmc.utils.registry_factory import MODEL_REGISTRY + + parser = argparse.ArgumentParser() + parser.add_argument('--model_type_1', type=str, required=True) + parser.add_argument('--model_path_1', type=str, required=True) + parser.add_argument('--model_type_2', type=str, required=True) + parser.add_argument('--model_path_2', type=str, required=True) + args = parser.parse_args() + + tokenizer = BaseTokenizer(args.model_path_1) + model1 = MODEL_REGISTRY[args.model_type_1](args.model_path_1, 'auto') + model2 = MODEL_REGISTRY[args.model_type_2](args.model_path_2, 'auto') + + # Llama2-70B config example + eval_cfg = { + 'name': 'wikitext2', + 'seq_len': 2048, + 'bs': 20, + 'download': False, + 'path': 'data_path', + 'inference_per_block': True, + } + token_consistency_eval = TokenConsistencyEval(tokenizer.get_tokenizer(), eval_cfg) + + consistency_ratio = token_consistency_eval.eval(model1, model2) + logger.info(f'Token consistency ratio: {consistency_ratio}') From 7994f92e73afc74793a411095172e08eea029064 Mon Sep 17 00:00:00 2001 From: gushiqiao <77222802+gushiqiao@users.noreply.github.com> Date: Tue, 22 Oct 2024 17:04:15 +0800 Subject: [PATCH 2/5] Update spinquant_w4a4.yml --- configs/quantization/SpinQuant/spinquant_w4a4.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configs/quantization/SpinQuant/spinquant_w4a4.yml b/configs/quantization/SpinQuant/spinquant_w4a4.yml index 0609839d7..6eb46852d 100644 --- a/configs/quantization/SpinQuant/spinquant_w4a4.yml +++ b/configs/quantization/SpinQuant/spinquant_w4a4.yml @@ -53,7 +53,7 @@ train: weight_decay: 0. lr_scheduler_type: "cosine" gradient_checkpointing: True - max_steps: 1 + max_steps: 100 output_dir: output_path logging_dir: your_log_path logging_first_step: True From b7173a77a7a27484a11026618328d913adbe0969 Mon Sep 17 00:00:00 2001 From: gushiqiao <77222802+gushiqiao@users.noreply.github.com> Date: Fri, 25 Oct 2024 11:21:24 +0800 Subject: [PATCH 3/5] Update README.md --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index 81be260a6..9797f5aea 100644 --- a/README.md +++ b/README.md @@ -283,6 +283,8 @@ You can add your own model type referring to files under `llmc/models/*.py`. 
✅ [QuaRot](https://arxiv.org/abs/2404.00456) +✅ [SpinQuant](https://arxiv.org/abs/2405.16406) + ### Pruning ✅ Naive(Magnitude) From 8736fe0d1fd92710dc1851db4beb50146aa08c68 Mon Sep 17 00:00:00 2001 From: gushiqiao <77222802+gushiqiao@users.noreply.github.com> Date: Fri, 25 Oct 2024 11:21:58 +0800 Subject: [PATCH 4/5] Update README_ja.md --- README_ja.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README_ja.md b/README_ja.md index b093271a8..03ba168a3 100644 --- a/README_ja.md +++ b/README_ja.md @@ -267,6 +267,8 @@ ✅ [QuaRot](https://arxiv.org/abs/2404.00456) +✅ [SpinQuant](https://arxiv.org/abs/2405.16406) + ### 剪定 ✅ Naive(Magnitude) From 15e4a45ca105c84f47e81cb0be6cb93afd60e365 Mon Sep 17 00:00:00 2001 From: gushiqiao <77222802+gushiqiao@users.noreply.github.com> Date: Fri, 25 Oct 2024 11:22:17 +0800 Subject: [PATCH 5/5] Update README_zh.md --- README_zh.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README_zh.md b/README_zh.md index 4732f561a..1f43d219d 100644 --- a/README_zh.md +++ b/README_zh.md @@ -263,6 +263,8 @@ ✅ [QuaRot](https://arxiv.org/abs/2404.00456) +✅ [SpinQuant](https://arxiv.org/abs/2405.16406) + ### 剪枝 ✅ Naive(Magnitude)
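Each new launcher under scripts/ follows the same pattern: it pins a GPU via CUDA_VISIBLE_DEVICES, points `llmc` at the repository root, and backgrounds `python -m llmc --config <yml>` with nohup, writing `llm_quant_exp.log` and `llm_quant_exp.pid`. For the SpinQuant launcher the referenced config is configs/quantization/SpinQuant/spinquant_w4a4.yml, whose train section (after the follow-up commit) runs 100 optimization steps with a cosine LR schedule and gradient checkpointing. A minimal usage sketch, assuming the repository is checked out at /path/to/llmc (placeholder) and that `llmc=llmc_path` inside the script has been edited to match:

```bash
# Run from scripts/ so the relative ../configs/... path in the script resolves.
cd /path/to/llmc/scripts

# Edit run_spinquant_llama.sh first:
#   llmc=/path/to/llmc      # repo root, exported onto PYTHONPATH by the script
#   gpu_id=0                # GPU exposed through CUDA_VISIBLE_DEVICES
bash run_spinquant_llama.sh

# The run is backgrounded with nohup; follow progress and clean up via the
# log/pid files derived from task_name=llm_quant_exp.
tail -f llm_quant_exp.log
kill "$(cat llm_quant_exp.pid)"   # stop the background job if needed
```

The other run_*.sh launchers differ only in the config they pass, so the same steps apply to, for example, run_quarot_llama.sh or run_wanda_llama.sh.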
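tools/outlier_analysis.py profiles an original checkpoint (--model_path) against a transformed one (--t_model_path): it hooks every linear layer block by block and either reports the kurtosis of weights and input activations (optionally plotting per-channel min/max ranges with --draw), or, with --cosine, the cosine similarity between the FP original's block outputs and the fake-quantized transformed model's. A sketch of two invocations under assumptions: the paths are placeholders, the Llama model type stands in for whatever key is registered under llmc/models/, and the flags are the ones defined in the script's argparse:

```bash
cd /path/to/llmc/tools

# Per-channel kurtosis / outlier statistics for weights and activations,
# written to the loguru log file given by --log_dir.
python outlier_analysis.py \
    --model_type Llama \
    --model_path /path/to/original_model \
    --t_model_path /path/to/transformed_model \
    --dataset_name wikitext2 --data_path /path/to/wikitext2 \
    --prof_gra per_channel \
    --save_path ./save --log_dir ./outlier_log.txt --draw

# Block-output cosine similarity: the transformed model is wrapped in
# FakeQuantLinear (W4A4 here) and compared against the FP original.
python outlier_analysis.py \
    --model_type Llama \
    --model_path /path/to/original_model \
    --t_model_path /path/to/transformed_model \
    --dataset_name wikitext2 --data_path /path/to/wikitext2 \
    --wbit 4 --abit 4 --cosine --log_dir ./cosine_log.txt
```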
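tools/token_analysis.py measures how often two checkpoints produce the same greedy (argmax) prediction at every position, a quick sanity check that a quantized model still tracks its FP counterpart. Its __main__ block hard-codes the eval_cfg (wikitext2, seq_len 2048, bs 20, download: False, path: 'data_path', inference_per_block: True), so the dataset path must be edited in the file before running. A hedged sketch with placeholder checkpoint paths; PYTHONPATH is set explicitly, mirroring the run scripts, so the llmc imports resolve regardless of the tool's relative sys.path handling:

```bash
# Make the llmc package importable, as the run_*.sh scripts do.
export PYTHONPATH=/path/to/llmc:$PYTHONPATH
cd /path/to/llmc/tools

# First edit eval_cfg['path'] in token_analysis.py to a local wikitext2 copy
# (download is hard-coded to False), then compare an FP model against a
# quantized/transformed export of the same architecture.
python token_analysis.py \
    --model_type_1 Llama --model_path_1 /path/to/fp_model \
    --model_type_2 Llama --model_path_2 /path/to/quant_model

# The script logs "Token consistency ratio: <value>"; 1.0 means the two models
# agree on every greedy token.
```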