From 88834357ab817ee80e4f8843046cac04aedf3aeb Mon Sep 17 00:00:00 2001 From: gushiqiao Date: Fri, 18 Oct 2024 18:58:56 +0800 Subject: [PATCH 1/5] Add spinquant --- README.md | 238 ++++-- README_ja.md | 266 +++--- README_zh.md | 291 ++++--- benchmark/align.md | 42 + benchmark/calib.md | 53 ++ ci_check/awq_w4a16_fakequant_eval.yml | 1 - ci_check/change_files.py | 40 +- ci_check/cpu.txt | 2 - ci_check/run.sh | 17 +- .../AdaDim/adadim_w8a8_fakequant_eval.yml | 37 + configs/quantization/Awq/awq_w4a16.yml | 33 + .../Awq/awq_w4a16_fakequant_eval.yml | 32 + .../Awq/awq_w4a16_fakequant_eval_general.yml | 37 + ...wq_w4a16_fakequant_eval_general_custom.yml | 36 + .../Awq/awq_w4a16_fakequant_trt-llm.yml | 36 + configs/quantization/Awq/awq_w4a4_best.yml | 52 ++ configs/quantization/Awq/awq_w4a8_best.yml | 52 ++ .../Awq/awq_w8a8_fakequant_eval_general.yml | 35 + .../Awq/awq_w_only_mix_bits_1.yml | 46 ++ .../Awq/awq_w_only_mix_bits_2.yml | 49 ++ configs/quantization/Awq/awq_wa_mix_bits.yml | 47 ++ .../DGQ/dgq_w4a8_fakequant_eval.yml | 41 + .../FP/awq_we2m1a16_128_fakequant_eval.yml | 33 + .../FP/rtn_w4a16_fakequant_eval.yml | 24 + .../FP/rtn_w8a8_fakequant_eval.yml | 27 + .../FP/rtn_we2m1a16_fakequant_eval.yml | 23 + .../FP/rtn_we2m1a16_fakequant_g128_eval.yml | 24 + .../FP/rtn_we2m1ae2m1_fakequant_eval.yml | 27 + .../FP/rtn_we4m3ae4m3_fakequant_eval.yml | 27 + .../FP/rtn_we5m2ae5m2_fakequant_eval.yml | 27 + .../GPTQ/gptq_owq_w4a16_fakequant_eval.yml | 41 + configs/quantization/GPTQ/gptq_quarot.yml | 51 ++ .../GPTQ/gptq_w4a16_fakequant_eval.yml | 39 + .../gptq_w4a16_fakequant_eval_general.yml | 39 + .../HQQ/hqq_w4a16_fakequant_eval.yml | 30 + .../LlmInt8/llmint8_w8a8_fakequant_eval.yml | 38 + .../ntweak_llama_w4a16_fakequant_eval.yml | 38 + .../ntweak_llama_w8a8_fakequant_eval.yml | 42 + .../OmniQuant/omniq_llama_w2a16_best.yml | 51 ++ .../omniq_llama_w2a16_fakequant_eval.yml | 49 ++ .../omniq_llama_w4a16_fakequant_eval.yml | 49 ++ .../OmniQuant/omniq_llama_w4a4_best.yml | 59 ++ .../OmniQuant/omniq_llama_w4a8_best.yml | 59 ++ .../omniq_llama_w8a8_fakequant_eval.yml | 51 ++ .../omniq_mistral_w8a8_fakequant_eval.yml | 49 ++ .../omniq_opt_w8a8_fakequant_eval.yml | 49 ++ .../OsPlus/osplus_llama_w4a4_best.yml | 46 ++ .../OsPlus/osplus_llama_w4a8_best.yml | 46 ++ ...plus_llama_w8a8_fakequant_eval_general.yml | 36 + ...osplus_opt_w8a8_fakequant_eval_general.yml | 36 + .../QUIK/quik_w4a4_fakequant_eval.yml | 41 + configs/quantization/QuaRot/quarot_w4a4.yml | 36 + configs/quantization/RTN/rtn_w4a16.yml | 16 + .../RTN/rtn_w4a16_fakequant_eval.yml | 23 + configs/quantization/RTN/rtn_w8a8.yml | 20 + .../RTN/rtn_w8a8_fakequant_eval.yml | 26 + .../RTN/rtn_w8a8_pertensor_static.yml | 36 + .../smoothquant_llama_w8a8_fakequant_eval.yml | 35 + ...uant_llama_w8a8_fakequant_eval_general.yml | 35 + .../smoothquant_llama_w8a8_trt-llm.yml | 35 + .../smoothquant_opt_w8a8_fakequant_eval.yml | 35 + ...hquant_opt_w8a8_fakequant_eval_general.yml | 35 + .../SpQR/spqr_w4a16_fakequant_eval.yml | 54 ++ .../quantization/SpinQuant/spinquant_w4a4.yml | 63 ++ .../sparsification/Magnitude/magnitude.yml | 30 + configs/sparsification/ShortGPT/shortgpt.yml | 30 + configs/sparsification/Wanda/wanda.yml | 31 + docs/en/source/advanced/model_test.md | 181 +++++ docs/en/source/configs.md | 102 +-- docs/en/source/index.rst | 20 +- docs/en/source/quickstart.md | 86 +- docs/zh_cn/source/advanced/model_test.md | 180 +++++ docs/zh_cn/source/configs.md | 107 +-- docs/zh_cn/source/index.rst | 19 +- docs/zh_cn/source/quickstart.md | 69 +- 
.../backend/autoawq/infer_with_autoawq.py | 34 - examples/backend/mlcllm/infer_with_mlcllm.py | 17 - examples/backend/sglang/infer_with_sglang.py | 13 - examples/backend/vllm/infer_with_vllm.py | 21 - llmc/__main__.py | 240 ++---- llmc/compression/blockwise_optimization.py | 6 +- llmc/compression/quantization/__init__.py | 4 +- llmc/compression/quantization/awq.py | 24 +- .../base_blockwise_quantization.py | 287 ++++--- llmc/compression/quantization/dgq.py | 31 +- llmc/compression/quantization/gptq.py | 100 ++- .../quantization/hadamard_utils.py | 102 ++- llmc/compression/quantization/hqq.py | 22 +- llmc/compression/quantization/llmint8.py | 4 +- llmc/compression/quantization/module_utils.py | 762 +++++++----------- llmc/compression/quantization/ntweak.py | 6 +- llmc/compression/quantization/omniq.py | 15 +- llmc/compression/quantization/osplus.py | 6 +- llmc/compression/quantization/quant.py | 520 ++++-------- llmc/compression/quantization/quarot.py | 55 +- llmc/compression/quantization/quik.py | 4 +- llmc/compression/quantization/rotate_utils.py | 102 +++ llmc/compression/quantization/rtn.py | 25 +- llmc/compression/quantization/smoothquant.py | 8 +- llmc/compression/quantization/spinquant.py | 231 ++++++ llmc/compression/quantization/spqr.py | 40 +- llmc/compression/quantization/train_utils.py | 187 +++++ llmc/compression/quantization/utils.py | 19 - .../base_blockwise_sparsification.py | 4 +- llmc/compression/sparsification/magnitude.py | 4 +- llmc/compression/sparsification/shortgpt.py | 4 +- llmc/compression/sparsification/wanda.py | 6 +- llmc/data/__init__.py | 2 +- llmc/data/dataset/__init__.py | 1 + llmc/data/dataset/base_dataset.py | 287 +------ llmc/data/dataset/specified_preproc.py | 88 +- llmc/data/dataset/train_dataset.py | 62 ++ llmc/data/tokenizer/base_tokenizer.py | 11 +- llmc/eval/__init__.py | 2 - llmc/eval/eval_ppl.py | 93 ++- llmc/eval/eval_token.py | 185 +++++ llmc/models/__init__.py | 11 - llmc/models/base_model.py | 78 +- llmc/models/bloom.py | 7 +- llmc/models/falcon.py | 10 +- llmc/models/gemma2.py | 37 +- llmc/models/internlm2.py | 18 +- llmc/models/llama.py | 15 +- llmc/models/llava.py | 22 +- llmc/models/mistral.py | 7 +- llmc/models/mixtral.py | 34 +- llmc/models/opt.py | 7 +- llmc/models/qwen2.py | 21 +- llmc/models/starcoder.py | 7 +- llmc/utils/__init__.py | 5 +- llmc/utils/utils.py | 14 - lm-evaluation-harness | 1 - requirements/runtime.txt | 25 +- scripts/run_adadim_llama.sh | 15 + scripts/run_awq_llama.sh | 16 + scripts/run_dgq_llama.sh | 16 + scripts/run_gptq_llama.sh | 15 + scripts/run_gptq_owq_llama.sh | 15 + scripts/run_hqq_llama.sh | 15 + scripts/run_in_tmux_sequence.sh | 25 + scripts/run_llmint8_llama.sh | 16 + scripts/run_ntweak_llama.sh | 16 + scripts/run_omniq_llama.sh | 16 + scripts/run_omniq_mistral.sh | 15 + scripts/run_omniq_opt.sh | 15 + scripts/run_osplus_llama.sh | 15 + scripts/run_osplus_opt.sh | 15 + scripts/run_quarot_llama.sh | 15 + scripts/run_quik_llama.sh | 15 + scripts/run_rtn_llama.sh | 15 + scripts/run_rtn_llama_static.sh | 15 + scripts/run_shortgpt_llama.sh | 15 + scripts/run_smoothquant_llama.sh | 15 + scripts/run_smoothquant_opt.sh | 15 + scripts/run_spinquant_llama.sh | 15 + scripts/run_spqr_llama.sh | 15 + scripts/run_wanda_llama.sh | 15 + tools/outlier_analysis.py | 483 +++++++++++ tools/token_analysis.py | 185 +++++ 159 files changed, 6223 insertions(+), 2711 deletions(-) create mode 100644 benchmark/align.md create mode 100644 benchmark/calib.md create mode 100644 
configs/quantization/AdaDim/adadim_w8a8_fakequant_eval.yml create mode 100644 configs/quantization/Awq/awq_w4a16.yml create mode 100644 configs/quantization/Awq/awq_w4a16_fakequant_eval.yml create mode 100644 configs/quantization/Awq/awq_w4a16_fakequant_eval_general.yml create mode 100644 configs/quantization/Awq/awq_w4a16_fakequant_eval_general_custom.yml create mode 100644 configs/quantization/Awq/awq_w4a16_fakequant_trt-llm.yml create mode 100644 configs/quantization/Awq/awq_w4a4_best.yml create mode 100644 configs/quantization/Awq/awq_w4a8_best.yml create mode 100644 configs/quantization/Awq/awq_w8a8_fakequant_eval_general.yml create mode 100644 configs/quantization/Awq/awq_w_only_mix_bits_1.yml create mode 100644 configs/quantization/Awq/awq_w_only_mix_bits_2.yml create mode 100644 configs/quantization/Awq/awq_wa_mix_bits.yml create mode 100644 configs/quantization/DGQ/dgq_w4a8_fakequant_eval.yml create mode 100644 configs/quantization/FP/awq_we2m1a16_128_fakequant_eval.yml create mode 100644 configs/quantization/FP/rtn_w4a16_fakequant_eval.yml create mode 100644 configs/quantization/FP/rtn_w8a8_fakequant_eval.yml create mode 100644 configs/quantization/FP/rtn_we2m1a16_fakequant_eval.yml create mode 100644 configs/quantization/FP/rtn_we2m1a16_fakequant_g128_eval.yml create mode 100644 configs/quantization/FP/rtn_we2m1ae2m1_fakequant_eval.yml create mode 100644 configs/quantization/FP/rtn_we4m3ae4m3_fakequant_eval.yml create mode 100644 configs/quantization/FP/rtn_we5m2ae5m2_fakequant_eval.yml create mode 100644 configs/quantization/GPTQ/gptq_owq_w4a16_fakequant_eval.yml create mode 100644 configs/quantization/GPTQ/gptq_quarot.yml create mode 100644 configs/quantization/GPTQ/gptq_w4a16_fakequant_eval.yml create mode 100644 configs/quantization/GPTQ/gptq_w4a16_fakequant_eval_general.yml create mode 100644 configs/quantization/HQQ/hqq_w4a16_fakequant_eval.yml create mode 100644 configs/quantization/LlmInt8/llmint8_w8a8_fakequant_eval.yml create mode 100644 configs/quantization/NormTweaking/ntweak_llama_w4a16_fakequant_eval.yml create mode 100644 configs/quantization/NormTweaking/ntweak_llama_w8a8_fakequant_eval.yml create mode 100644 configs/quantization/OmniQuant/omniq_llama_w2a16_best.yml create mode 100644 configs/quantization/OmniQuant/omniq_llama_w2a16_fakequant_eval.yml create mode 100644 configs/quantization/OmniQuant/omniq_llama_w4a16_fakequant_eval.yml create mode 100644 configs/quantization/OmniQuant/omniq_llama_w4a4_best.yml create mode 100644 configs/quantization/OmniQuant/omniq_llama_w4a8_best.yml create mode 100644 configs/quantization/OmniQuant/omniq_llama_w8a8_fakequant_eval.yml create mode 100644 configs/quantization/OmniQuant/omniq_mistral_w8a8_fakequant_eval.yml create mode 100644 configs/quantization/OmniQuant/omniq_opt_w8a8_fakequant_eval.yml create mode 100644 configs/quantization/OsPlus/osplus_llama_w4a4_best.yml create mode 100644 configs/quantization/OsPlus/osplus_llama_w4a8_best.yml create mode 100644 configs/quantization/OsPlus/osplus_llama_w8a8_fakequant_eval_general.yml create mode 100644 configs/quantization/OsPlus/osplus_opt_w8a8_fakequant_eval_general.yml create mode 100644 configs/quantization/QUIK/quik_w4a4_fakequant_eval.yml create mode 100644 configs/quantization/QuaRot/quarot_w4a4.yml create mode 100644 configs/quantization/RTN/rtn_w4a16.yml create mode 100644 configs/quantization/RTN/rtn_w4a16_fakequant_eval.yml create mode 100644 configs/quantization/RTN/rtn_w8a8.yml create mode 100644 configs/quantization/RTN/rtn_w8a8_fakequant_eval.yml create mode 
100644 configs/quantization/RTN/rtn_w8a8_pertensor_static.yml create mode 100644 configs/quantization/SmoothQuant/smoothquant_llama_w8a8_fakequant_eval.yml create mode 100644 configs/quantization/SmoothQuant/smoothquant_llama_w8a8_fakequant_eval_general.yml create mode 100644 configs/quantization/SmoothQuant/smoothquant_llama_w8a8_trt-llm.yml create mode 100644 configs/quantization/SmoothQuant/smoothquant_opt_w8a8_fakequant_eval.yml create mode 100644 configs/quantization/SmoothQuant/smoothquant_opt_w8a8_fakequant_eval_general.yml create mode 100644 configs/quantization/SpQR/spqr_w4a16_fakequant_eval.yml create mode 100644 configs/quantization/SpinQuant/spinquant_w4a4.yml create mode 100644 configs/sparsification/Magnitude/magnitude.yml create mode 100644 configs/sparsification/ShortGPT/shortgpt.yml create mode 100644 configs/sparsification/Wanda/wanda.yml create mode 100644 docs/en/source/advanced/model_test.md create mode 100644 docs/zh_cn/source/advanced/model_test.md delete mode 100644 examples/backend/autoawq/infer_with_autoawq.py delete mode 100644 examples/backend/mlcllm/infer_with_mlcllm.py delete mode 100644 examples/backend/sglang/infer_with_sglang.py delete mode 100644 examples/backend/vllm/infer_with_vllm.py create mode 100644 llmc/compression/quantization/rotate_utils.py create mode 100644 llmc/compression/quantization/spinquant.py create mode 100644 llmc/data/dataset/train_dataset.py create mode 100644 llmc/eval/eval_token.py delete mode 160000 lm-evaluation-harness create mode 100644 scripts/run_adadim_llama.sh create mode 100644 scripts/run_awq_llama.sh create mode 100644 scripts/run_dgq_llama.sh create mode 100644 scripts/run_gptq_llama.sh create mode 100644 scripts/run_gptq_owq_llama.sh create mode 100644 scripts/run_hqq_llama.sh create mode 100644 scripts/run_in_tmux_sequence.sh create mode 100644 scripts/run_llmint8_llama.sh create mode 100644 scripts/run_ntweak_llama.sh create mode 100644 scripts/run_omniq_llama.sh create mode 100644 scripts/run_omniq_mistral.sh create mode 100644 scripts/run_omniq_opt.sh create mode 100644 scripts/run_osplus_llama.sh create mode 100644 scripts/run_osplus_opt.sh create mode 100644 scripts/run_quarot_llama.sh create mode 100644 scripts/run_quik_llama.sh create mode 100644 scripts/run_rtn_llama.sh create mode 100644 scripts/run_rtn_llama_static.sh create mode 100644 scripts/run_shortgpt_llama.sh create mode 100644 scripts/run_smoothquant_llama.sh create mode 100644 scripts/run_smoothquant_opt.sh create mode 100644 scripts/run_spinquant_llama.sh create mode 100644 scripts/run_spqr_llama.sh create mode 100644 scripts/run_wanda_llama.sh create mode 100644 tools/outlier_analysis.py create mode 100644 tools/token_analysis.py diff --git a/README.md b/README.md index fc51e66ab..81be260a6 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# LLMC: Towards Accurate and Efficient LLM Compression +# llmc: Towards Accurate and Efficient LLM Compression llmc @@ -13,53 +13,20 @@ **\[ English | [中文](README_zh.md) | [日本語](README_ja.md) \]** -**LLMC** is an off-the-shell tool designed for compressing LLM, leveraging state-of-the-art compression algorithms to enhance efficiency and reduce model size without compromising performance. +**llmc** is an off-the-shell tool designed for compressing LLM, leveraging state-of-the-art compression algorithms to enhance efficiency and reduce model size without compromising performance. **English doc** is [here](https://llmc-en.readthedocs.io/en/latest/). 
**Chinese doc** is [here](https://llmc-zhcn.readthedocs.io/en/latest/). -**docker hub** is [here](https://hub.docker.com/r/llmcompression/llmc). - -**aliyun docker**: `registry.cn-hangzhou.aliyuncs.com/yongyang/llmcompression:[tag]` - -You can download the Docker image that can run llmc with the following command. Users in mainland China are recommended to use Alibaba Cloud Docker. - -docker hub - -``` -docker pull llmcompression/llmc:pure-latest -``` - -aliyun docker - -``` -docker pull registry.cn-hangzhou.aliyuncs.com/yongyang/llmcompression:pure-latest -``` - **Community**: - [Discord Server](https://discord.gg/qZKUDfhm) - [Tencent QQ Group](http://qm.qq.com/cgi-bin/qm/qr?_wv=1027&k=I9IGPWWj8uuRXWH3_ELWjouf6gkIMgUl&authKey=GA3WbFAsm90ePJf%2FCbc7ZyXXq4ShQktlBaLxgqS5yuSPAsr3%2BDKMRdosUiLYoilO&noverify=0&group_code=526192592) -## Latest News +## News -- **Sep 26, 2024:** 🔥 We now support exporting 💥`FP8 quantized(E4M3, E5M2)` models from 🚀`LLMC` to advanced inference backends such as [VLLM](https://github.com/vllm-project/vllm) and [SGLang](https://github.com/sgl-project/sglang). For detailed usage, please refer to the [VLLM documentation](https://llmc-en.readthedocs.io/en/latest/backend/vllm.html) and [SGLang documentation](https://llmc-en.readthedocs.io/en/latest/backend/sglang.html). - -- **Sep 24, 2024:** 🔥 We have officially released ✅INT4 and ✅INT8 models of ✨`Llama-3.1-405B`, quantized using 🚀`LLMC` in `save_lightllm` mode. You can download the model parameters [here](https://huggingface.co/Dongz/llama31-405b-quant). - -- **Sep 23, 2024:** 🔥 We now support exporting ✨`real quantized(INT4, INT8)` models from 🚀`LLMC` to advanced inference backends such as [VLLM](https://github.com/vllm-project/vllm), [SGLang](https://github.com/sgl-project/sglang), [AutoAWQ](https://github.com/casper-hansen/AutoAWQ), and [MLC-LLM](https://github.com/mlc-ai/mlc-llm) for quantized inference deployment, enabling ✨`reduced memory usage` and ✨`faster inference speeds`. - For detailed usage, please refer to the [VLLM documentation](https://llmc-en.readthedocs.io/en/latest/backend/vllm.html), [SGLang documentation](https://llmc-en.readthedocs.io/en/latest/backend/sglang.html), [AutoAWQ documentation](https://llmc-en.readthedocs.io/en/latest/backend/autoawq.html), and [MLC-LLM documentation](https://llmc-en.readthedocs.io/en/latest/backend/mlcllm.html). - -- **Sep 9, 2024:** 🔥 We provide some configs of our best practice towards superior performance (see Best Practice [here](https://llmc-en.readthedocs.io/en/latest/)). - -* **Sep 3, 2024:** 🔥 We support [opencompass](https://github.com/open-compass/opencompass) 🤗 to eval 🚀`LLMC` model. Follow this [doc](https://llmc-en.readthedocs.io/en/latest/advanced/model_test_v2.html) and have a try! - -* **Aug 22, 2024:** 🔥We support lots of small language models, including current SOTA [SmolLM](https://huggingface.co/collections/HuggingFaceTB/smollm-6695016cad7167254ce15966)(see [Supported Model List](#supported-model-list)). - -* **Aug 22, 2024:** 🔥 Additionally, we also support down stream task evaluation through our modified [lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-harness) 🤗. Specifically, people can first employ `save_trans` mode(see `save` part in [Configuration](https://llmc-en.readthedocs.io/en/latest/configs.html)) to save a weight modified model. After obtaining the transformed model, they can directly evaluate the quantized model referring to [run_lm_eval.sh](scripts/run_lm_eval.sh). 
More details can be found in [here](https://llmc-en.readthedocs.io/en/latest/advanced/model_test_v1.html). - -* **Jul 23, 2024:** 🍺🍺🍺 We release a brand new version benchmark paper: +- **Jul 23, 2024:** 🍺🍺🍺 We release a brand new version benchmark paper: [**LLMC: Benchmarking Large Language Model Quantization with a Versatile Compression Toolkit**](https://arxiv.org/abs/2405.06001v2). @@ -67,13 +34,21 @@ docker pull registry.cn-hangzhou.aliyuncs.com/yongyang/llmcompression:pure-lates (\* denotes equal contribution, 📧 denotes corresponding author.) -
-Previous News +
+ comp +
+ + Instead of focusing on the best practice, We modularly and fairly benchmark LLM quantization considering calibration data, algorithms, and data formats. With detailed observation and analysis, we provide various types of novel points for performance and method improvements under different configurations. With the powerful toolkit LLMC and comprehensive insights, future LLM researchers can efficiently integrate suitable algorithms and low-bit formats for their applications, thereby democratizing the compression of large language models. - **Jul 16, 2024:** 🔥We support Wanda/Naive(Magnitude) for llm sparsification and layer-wise mix bits quantization now! - **Jul 14, 2024:** 🔥We support rotation based quantization QuaRot now! +- **Jul 4, 2024:** 📱 We open our discussion channel. If you have any questions, please join our community: + + - [Discord Server](https://discord.gg/qZKUDfhm) + - [Tencent QQ Group](http://qm.qq.com/cgi-bin/qm/qr?_wv=1027&k=I9IGPWWj8uuRXWH3_ELWjouf6gkIMgUl&authKey=GA3WbFAsm90ePJf%2FCbc7ZyXXq4ShQktlBaLxgqS5yuSPAsr3%2BDKMRdosUiLYoilO&noverify=0&group_code=526192592) + - **May 17, 2024:** 🚀 We support some advanced large models, e.g., LLaVA, Mixtral, LLaMA V3 and Qwen V2 now. Have a try! - **May 13, 2024:** 🍺🍺🍺 We release our quantization benchmark paper: @@ -94,23 +69,157 @@ docker pull registry.cn-hangzhou.aliyuncs.com/yongyang/llmcompression:pure-lates - **Mar 7, 2024:** 🚀 We release the quantization part of a powerful and efficient LLM compression tool. Notably, our benchmark paper is coming soon😊. -
-
 
 ## Highlight Feature
 
-- 💥**Comprehensive Algorithm Support**: Provides a broad range of ✨`SOTA compression algorithms`, including ✅quantization, ✅mixed-precision quantization, and ✅sparsity, while maintaining accuracy consistent with the original repositories. ✨`Quantization best practices` (see 🚀`Best Practices` [here](https://llmc-en.readthedocs.io/en/latest/)) are also available to ensure optimal performance and efficiency.
-
-- 💥**Supported Formats**: Supports both ✨`quantization` (integer and floating-point) and ✨`sparsity`, specifically including ✅weight-activation, ✅weight-only, ✅mixed-precision quantization, as well as ✅structured and ✅unstructured sparsity.
-
-- 💥**Wide Model Support**: Offers support for a diverse array of ✨`LLM models`, including ✅LLama, ✅Mistral, ✅InternLM2, ✅Qwen2, among others, as well as ✅MOE and ✅VLM models (see [Supported Model List](#supported-model-list)).
-
-- 💥**Multi-backend Compatibility**: Seamlessly integrates with various backends for enhanced deployment flexibility. Multiple quantization settings and model formats are compatible with a wide range of backends and hardware platforms, such as ✅VLLM, ✅Sglang, ✅LightLLM, ✅MLC-LLM, and ✅AutoAWQ, making it highly versatile(see Section `Backend` [here](https://llmc-en.readthedocs.io/en/latest/)).
-
-- 💥**Performance Efficiency**: Enables quantization of large LLMs, such as ✨`Llama3.1-405B` and ✨`OPT-175B`, with PPL evaluation on a `single A100/H100/H800 GPU`.
+- Quantize LLMs, e.g., Llama2-70B and OPT-175B, and evaluate their PPL on only one A100/H100/H800 GPU💥.
+- SOTA compression algorithms [align with the original repos](benchmark/align.md) for users to choose from, and users can sequentially apply multiple algorithms to one LLM💥.
+- A transformed model (`save_trans` mode in the `quant` part of [Configuration](#configuration)), exported by our tool after a specific compression algorithm, can go through naive quantization by multiple backends, e.g., [Lightllm](https://github.com/ModelTC/lightllm) and [TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM), to get a model optimized by that algorithm, which the corresponding backend can then infer💥.
+- Our compressed model (`save_lightllm` mode in the `quant` part of [Configuration](#configuration)) has a small memory footprint and can be directly inferred by [Lightllm](https://github.com/ModelTC/lightllm)💥.
 
 ## Usage
 
-Please refer to the 🚀`Quick Start` section in the [documentation](https://llmc-en.readthedocs.io/en/latest/).
+1. Clone this repository and install packages:
+
+   ```shell
+   # install packages
+   cd llmc
+   pip install -r requirements.txt
+   ```
+
+2. Prepare models and data.
+
+   ```shell
+   # After downloading LLMs from Hugging Face, prepare calibration and evaluation data as follows:
+   cd tools
+   python download_calib_dataset.py --save_path [calib data path]
+   python download_eval_dataset.py --save_path [eval data path]
+   ```
+
+3. Choose an algorithm to quantize your model:
+
+   ```shell
+   # Here's an example using Awq:
+   cd scripts
+   # Modify the path of llmc, ``llmc_path``, in the bash file. You can also choose one config
+   # placed in ``llmc/configs/quantization/Awq/`` to quantize your model, or write your own
+   # config (referring to those we provide) and change the ``--config`` argument in run_awq_llama.sh.
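   # For orientation, a rough sketch of what such a launch script boils down to, based on
   # ci_check/run.sh from this same patch (the exact contents of run_awq_llama.sh may differ,
   # and /path/to/llmc is a placeholder):
   #   export PYTHONPATH=/path/to/llmc:$PYTHONPATH
   #   python -m llmc --config ../configs/quantization/Awq/awq_w4a16_fakequant_eval.yml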
+   bash run_awq_llama.sh
+   ```
+
+## Configuration
+
+To help users design their configs, we now explain some universal configurations found in all the configs we provide under `llmc/configs/`:
+
+- `model`:
+
+  ```yaml
+  model:
+      # Replace by the name of the class in ``llmc/models/*.py``.
+      type: Llama
+      # Replace by the path of your model.
+      path: model path
+      torch_dtype: auto
+  ```
+
+- `calib`:
+
+  ```yaml
+  # Note: some algorithms do not need ``calib``, like naive... So, you can remove this part.
+  calib:
+      # Replace by the calibration data name, e.g., pileval, c4, wikitext2, or ptb, downloaded before.
+      name: pileval
+      download: False
+      # Replace by the path of one of the calibration datasets, e.g., pileval, c4, wikitext2, or ptb,
+      # downloaded before.
+      path: calib data path
+      n_samples: 128
+      bs: -1
+      seq_len: 512
+      # Replace by the function name in ``llmc/data/dataset/specified_preproc.py``.
+      preproc: general
+      seed: *seed
+  ```
+
+- `eval`:
+
+  ```yaml
+  # If you want to evaluate the PPL of your pretrained/transformed/fake_quant model.
+  eval:
+      # You can evaluate the pretrain, transformed, and fake_quant models, and set the positions
+      # you want to evaluate.
+      eval_pos: [pretrain, transformed, fake_quant]
+      # Replace by the name of the eval data, e.g., c4, wikitext2, ptb, or [c4, wikitext2],
+      # downloaded before.
+      name: wikitext2
+      download: False
+      path: eval data path
+      # For 70B model eval, bs can be set to 20, and inference_per_block can be set to True.
+      # For 7B / 13B model eval, bs can be set to 1, and inference_per_block can be set to False.
+      bs: 1
+      inference_per_block: False
+      seq_len: 2048
+  ```
+
+- `save`:
+
+  ```yaml
+  save:
+      # ``save_trans`` is True, which means you want to export the transformed model, i.e., a parameter-modified
+      # model whose performance and structure are the same as the original model; users can
+      # apply naive quantization to the transformed model to obtain the same performance as
+      # the specific-algorithm-quantized model.
+      save_trans: False
+      # ``save_lightllm`` is True, which means you want to export a real quantized model, i.e.,
+      # low-bit weights with weight and activation quantization parameters.
+      save_lightllm: False
+      # ``save_fake`` is True, which means you want to export a fake_quant model, i.e.,
+      # dequantized weights with activation quantization parameters.
+      save_fake: False
+      save_path: ./save
+  ```
+
+- `quant`:
+
+  ```yaml
+  quant:
+      # Replace by the class name in ``llmc/compression/quantization/*.py``.
+      method: OmniQuant
+      # Weight-only quantization does not have an ``act`` part.
+      weight:
+          bit: 8
+          symmetric: True
+          # Quantization granularity: per_channel, per_tensor, per_head (not recommended).
+          granularity: per_channel
+          group_size: -1
+          # Calibration algorithms: learnable, mse, and minmax (default).
+          calib_algo: learnable
+          # Utilize Straight-Through Estimation, which is necessary for learnable
+          # calibration algorithms.
+          ste: True
+      act:
+          bit: 8
+          symmetric: True
+          # Quantization granularity: per_token, per_tensor
+          granularity: per_token
+          ste: True
+          # Static quantization (quantization during calibration) or dynamic
+          # quantization (quantization during inference).
+          static: True
+      # This part is designed for specific algorithms; users can refer to
+      # those we provide to design their own.
+ special: + let: True + lwc_lr: 0.01 + let_lr: 0.005 + use_shift: False + alpha: 0.5 + deactive_amp: True + epochs: 20 + wd: 0 + # If quant_out is True, employ the outputs of the former quantized block as the + # calibration data of the proceeding block. + quant_out: True + ``` ## Supported Model List @@ -138,34 +247,8 @@ Please refer to the 🚀`Quick Start` section in the [documentation](https://llm ✅ [LLaVA](https://github.com/haotian-liu/LLaVA) -✅ [InternLM2.5](https://huggingface.co/internlm) - -✅ [StableLM](https://github.com/Stability-AI/StableLM) - -✅ [Gemma2](https://huggingface.co/docs/transformers/main/en/model_doc/gemma2) - -✅ [Phi2](https://huggingface.co/microsoft/phi-2) - -✅ [Phi 1.5](https://huggingface.co/microsoft/phi-1_5) - -✅ [MiniCPM](https://github.com/OpenBMB/MiniCPM) - -✅ [SmolLM](https://huggingface.co/collections/HuggingFaceTB/smollm-6695016cad7167254ce15966) - You can add your own model type referring to files under `llmc/models/*.py`. -## Supported Backend List - -✅ [VLLM](https://github.com/vllm-project/vllm) - -✅ [LightLLM](https://github.com/ModelTC/lightllm) - -✅ [Sglang](https://github.com/sgl-project/sglang) - -✅ [MLC-LLM](https://github.com/mlc-ai/mlc-llm) - -✅ [AutoAWQ](https://github.com/casper-hansen/AutoAWQ) - ## Supported Algorithm List ### Quantization @@ -225,7 +308,6 @@ We develop our code referring to the following repos: - https://github.com/mobiusml/hqq - [https://github.com/spcl/QuaRot](https://github.com/spcl/QuaRot) - [https://github.com/locuslab/wanda](https://github.com/locuslab/wanda) -- [https://github.com/EleutherAI/lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-harness) ## Star History diff --git a/README_ja.md b/README_ja.md index ed69b3f0d..b093271a8 100644 --- a/README_ja.md +++ b/README_ja.md @@ -1,114 +1,209 @@ -# LLMC: 正確で効率的なLLM圧縮に向けて +# llmc: 正確で効率的なLLM圧縮に向けて llmc -[![License](https://img.shields.io/badge/License-Apache_2.0-blue.svg)](https://opensource.org/licenses/Apache-2.0) +[![ライセンス](https://img.shields.io/badge/ライセンス-Apache_2.0-blue.svg)](https://opensource.org/licenses/Apache-2.0) [![arXiv](https://img.shields.io/badge/LLMC-2405.06001-b31b1b)](https://arxiv.org/abs/2405.06001) -[![GitHub Stars](https://img.shields.io/github/stars/ModelTC/llmc.svg?style=social&label=Star&maxAge=60)](https://github.com/ModelTC/llmc) -![visitors](https://komarev.com/ghpvc/?username=llmc&label=visitors) -[![Discord Banner](https://img.shields.io/discord/1139835312592392214?logo=discord&logoColor=white)](https://discord.gg/qZKUDfhm) +[![GitHub スター](https://img.shields.io/github/stars/ModelTC/llmc.svg?style=social&label=Star&maxAge=60)](https://github.com/ModelTC/llmc) +![訪問者](https://komarev.com/ghpvc/?username=llmc&label=visitors) +[![Discord バナー](https://img.shields.io/discord/1139835312592392214?logo=discord&logoColor=white)](https://discord.gg/qZKUDfhm) [![QQ](https://img.shields.io/badge/QQ-EB1923?logo=tencent-qq&logoColor=white)](http://qm.qq.com/cgi-bin/qm/qr?_wv=1027&k=I9IGPWWj8uuRXWH3_ELWjouf6gkIMgUl&authKey=GA3WbFAsm90ePJf%2FCbc7ZyXXq4ShQktlBaLxgqS5yuSPAsr3%2BDKMRdosUiLYoilO&noverify=0&group_code=526192592) [![Doc](https://img.shields.io/badge/docs-English-99cc2)](https://llmc-en.readthedocs.io/en/latest/) [![Doc](https://img.shields.io/badge/文档-中文-99cc2)](https://llmc-zhcn.readthedocs.io/en/latest/) -**\[ English | [中文](README_zh.md) | [日本語](README_ja.md) \]** +**\[ [English](README.md) | [中文](README_zh.md) | 日本語 \]** -**LLMC** は、大規模言語モデル(LLM)の圧縮を目的とした、最新の圧縮アルゴリズムを活用して、パフォーマンスを損なうことなく効率を向上させ、モデルサイズを削減するためのツールです。 
+**llmc** は、最先端の圧縮アルゴリズムを活用して、パフォーマンスを損なうことなく効率を向上させ、モデルサイズを削減することを目的とした、オフ・ザ・シェルフのツールです。 -**英語のドキュメント**は[こちら](https://llmc-en.readthedocs.io/en/latest/)。 +**英語のドキュメント**は[こちら](https://llmc-en.readthedocs.io/en/latest/)です。 -**中国語のドキュメント**は[こちら](https://llmc-zhcn.readthedocs.io/en/latest/)。 - -**Docker Hub**は[こちら](https://hub.docker.com/r/llmcompression/llmc)。 - -**aliyun docker**: `registry.cn-hangzhou.aliyuncs.com/yongyang/llmcompression:[tag]` - -以下のコマンドを使用して、llmcを実行できるDockerイメージをダウンロードできます。中国大陸のユーザーは、阿里云Dockerを使用することを推奨します。 - -docker hub - -``` -docker pull llmcompression/llmc:pure-latest -``` - -阿里云Docker - -``` -docker pull registry.cn-hangzhou.aliyuncs.com/yongyang/llmcompression:pure-latest -``` +**中国語のドキュメント**は[こちら](https://llmc-zhcn.readthedocs.io/en/latest/)です。 **コミュニティ**: -- [Discordサーバー](https://discord.gg/qZKUDfhm) -- [Tencent QQグループ](http://qm.qq.com/cgi-bin/qm/qr?_wv=1027&k=I9IGPWWj8uuRXWH3_ELWjouf6gkIMgUl&authKey=GA3WbFAsm90ePJf%2FCbc7ZyXXq4ShQktlBaLxgqS5yuSPAsr3%2BDKMRdosUiLYoilO&noverify=0&group_code=526192592) - -## 最新情報 - -- **2024年9月26日:** 🔥 `LLMC`からの✨ `FP8量子化(E4M3、E5M2)`モデルを、VLLMやSGLangのような高度な推理バックエンドにエクスポートできるようになりました。🚀 詳細な使用方法については、[VLLMのドキュメント](https://llmc-en.readthedocs.io/en/latest/backend/vllm.html)と[SGLangのドキュメント](https://llmc-en.readthedocs.io/en/latest/backend/sglang.html)を参照してください。 - -- **2024年9月24日:** 🔥 私たちは正式に ✨`Llama-3.1-405B` の ✅INT4 と ✅INT8 モデルをリリースしました。これらは 🚀`LLMC` の `save_lightllm` モードを使用して量子化されています。モデルパラメータは[こちら](https://huggingface.co/Dongz/llama31-405b-quant)からダウンロードできます。 +- [Discord サーバー](https://discord.gg/qZKUDfhm) +- [Tencent QQ グループ](http://qm.qq.com/cgi-bin/qm/qr?_wv=1027&k=I9IGPWWj8uuRXWH3_ELWjouf6gkIMgUl&authKey=GA3WbFAsm90ePJf%2FCbc7ZyXXq4ShQktlBaLxgqS5yuSPAsr3%2BDKMRdosUiLYoilO&noverify=0&group_code=526192592) -- **2024年9月23日:** 🔥 私たちは、🚀`LLMC` から ✨`実際の量子化された(INT4, INT8)` モデルを、 [VLLM](https://github.com/vllm-project/vllm), [SGLang](https://github.com/sgl-project/sglang), [AutoAWQ](https://github.com/casper-hansen/AutoAWQ), [MLC-LLM](https://github.com/mlc-ai/mlc-llm) などの高度な推論バックエンドにエクスポートするサポートを追加しました。これにより、✨`メモリ使用量の削減` と ✨`推論速度の向上` が可能になります。 - 詳細については、[VLLMドキュメント](https://llmc-en.readthedocs.io/en/latest/backend/vllm.html)、[SGLangドキュメント](https://llmc-en.readthedocs.io/en/latest/backend/sglang.html)、[AutoAWQドキュメント](https://llmc-en.readthedocs.io/en/latest/backend/autoawq.html)、および [MLC-LLMドキュメント](https://llmc-en.readthedocs.io/en/latest/backend/mlcllm.html) を参照してください。 +## ニュース -- **2024年9月9日:** 🔥 パフォーマンス向上のためのベストプラクティス構成をいくつか提供しています(ベストプラクティスは[こちら](https://llmc-en.readthedocs.io/en/latest/)をご覧ください)。 +- **2024 年 7 月 23 日:** 🍺🍺🍺 新しいバージョンのベンチマーク ペーパーをリリースします: -- **2024年9月3日:** 🔥 私たちは、[opencompass](https://github.com/open-compass/opencompass) を使用して 🚀`LLMC` モデルを評価するサポートを提供しています。この[ドキュメント](https://llmc-en.readthedocs.io/en/latest/advanced/model_test_v2.html)に従って試してみてください! 
+ [**LLMC: 多用途の圧縮ツールキットを使用した大規模言語モデル量子化のベンチマーク**](https://arxiv.org/abs/2405.06001v2)。 -- **2024年8月22日:** 🔥私たちは現在のSOTAモデル [SmolLM](https://huggingface.co/collections/HuggingFaceTB/smollm-6695016cad7167254ce15966) を含む多くの小型言語モデルをサポートしています([サポートされているモデルリスト](#supported-model-list)を参照してください)。 + [Ruihao Gong\*](https://xhplus.github.io/)、[Yang Yong\*](https://github.com/helloyongyang)、[Shiqiao Gu\*](https://github.com/gushiqiao)、[Yushi Huang\*](https://github.com/Harahan)、[Chengtao Lv](https://scholar.google.com/citations?user=r8vseSUAAAAJ&hl=en)、[Yunchen Zhang](https://scholar.google.com/citations?user=glkWFyUAAAAJ&hl=en)、[Xianglong Liu📧](https://xlliu-beihang.github.io/)、[Dacheng Tao](https://scholar.google.com/citations?user=RwlJNLcAAAAJ&hl=en) -- **2024年8月22日:** 🔥また、修正された [lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-harness) を使用した下流タスクの評価もサポートしています 🤗。具体的には、`save_trans` モードを使用して([構成](https://llmc-en.readthedocs.io/en/latest/configs.html)の `save` 部分を参照)変換されたモデルを保存し、その後、[run_lm_eval.sh](scripts/run_lm_eval.sh) を参照して量子化されたモデルを直接評価できます。詳細は[こちら](https://llmc-en.readthedocs.io/en/latest/advanced/model_test_v1.html)をご覧ください。 + (\* は同等の貢献、📧 は対応する貢献を表します著者。) -- **2024年7月23日:** 🍺🍺🍺 新しいベンチマーク論文をリリースしました: - - [**LLMC: Benchmarking Large Language Model Quantization with a Versatile Compression Toolkit**](https://arxiv.org/abs/2405.06001v2)。 +
+ comp +
- [Ruihao Gong\*](https://xhplus.github.io/), [Yang Yong\*](https://github.com/helloyongyang), [Shiqiao Gu\*](https://github.com/gushiqiao), [Yushi Huang\*](https://github.com/Harahan), [Chengtao Lv](https://scholar.google.com/citations?user=r8vseSUAAAAJ&hl=en), [Yunchen Zhang](https://scholar.google.com/citations?user=glkWFyUAAAAJ&hl=en), [Xianglong Liu📧](https://xlliu-beihang.github.io/), [Dacheng Tao](https://scholar.google.com/citations?user=RwlJNLcAAAAJ&hl=en) + ベストプラクティスに焦点を当てるのではなく、キャリブレーションデータ、アルゴリズム、データ形式を考慮して、LLM量子化をモジュール式かつ公平にベンチマークします。詳細な観察と分析により、さまざまな構成でパフォーマンスと方法を改善するためのさまざまなタイプの新しいポイントを提供します。強力なツールキットLLMCと包括的な洞察により、将来のLLM研究者は、アプリケーションに適したアルゴリズムと低ビット形式を効率的に統合し、大規模な言語モデルの圧縮を民主化できます。 - (\*は同等の貢献を示し、📧は対応する著者を示します。) +- **2024年7月16日:** 🔥現在、llmのスパース化と層間混合ビット量子化のためのWanda/Naive(Magnitude)をサポートしています! -
-過去のニュース +- **2024年7月14日:** 🔥現在、回転ベースの量子化QuaRotをサポートしています! -- **2024年7月16日:** 🔥私たちはLLMの疎化のためのWanda/Naive(マグニチュード)および層ごとの混合ビット量子化のサポートを追加しました! +- **2024年7月4日:** 📱 ディスカッションチャンネルを開設しました。質問がある場合は、コミュニティに参加してください: -- **2024年7月14日:** 🔥私たちは回転ベースの量子化 QuaRot のサポートを追加しました! + - [Discord サーバー](https://discord.gg/qZKUDfhm) + - [Tencent QQ グループ](http://qm.qq.com/cgi-bin/qm/qr?_wv=1027&k=I9IGPWWj8uuRXWH3_ELWjouf6gkIMgkUl&authKey=GA3WbFAsm90ePJf%2FCbc7ZyXXq4ShQktlBaLxgqS5yuSPAsr3%2BDKMRdosUiLYoilO&noverify=0&group_code=526192592) -- **2024年5月17日:** 🚀 私たちは、LLaVA、Mixtral、LLaMA V3、Qwen V2などのいくつかの高度な大規模モデルをサポートしています。お試しください! +- **2024年5月17日:** 🚀 現在、LLaVA、Mixtral、LLaMA V3、Qwen V2などの高度な大規模モデルをサポートしています。試してみてください! -- **2024年5月13日:** 🍺🍺🍺 私たちは量子化ベンチマーク論文をリリースしました: +- **2024年5月13日:** 🍺🍺🍺 量子化ベンチマーク論文を発表しました: - [**LLM-QBench: A Benchmark Towards the Best Practice for Post-training Quantization of Large Language Models**](https://arxiv.org/abs/2405.06001)。 + [**LLM-QBench: 大規模言語モデルのポストトレーニング量子化のベストプラクティスに向けたベンチマーク**](https://arxiv.org/abs/2405.06001). [Ruihao Gong\*](https://xhplus.github.io/), [Yang Yong\*](https://github.com/helloyongyang), [Shiqiao Gu\*](https://github.com/gushiqiao), [Yushi Huang\*](https://github.com/Harahan), [Yunchen Zhang](https://scholar.google.com/citations?user=glkWFyUAAAAJ&hl=en), [Xianglong Liu📧](https://xlliu-beihang.github.io/), [Dacheng Tao](https://scholar.google.com/citations?user=RwlJNLcAAAAJ&hl=en) - (\*は同等の貢献を示し、📧は対応する著者を示します。) + (\* は同等の貢献を示し、📧 は対応する著者を示します。)
comp
- 私たちは、校正コスト、推論効率、量子化精度を考慮して、量子化技術を公正にベンチマークしました。さまざまなモデルとデータセットに関して600件近い実験を行い、校正データ、アルゴリズムパイプライン、および量子化構成の選択に関する3つの洞察を得ました。これらの洞察に基づいて、LLMの後処理量子化パイプラインに対するベストプラクティスが設計され、さまざまなシナリオでのパフォーマンスと効率のバランスを実現します。 - -- **2024年3月7日:** 🚀 私たちは強力で効率的なLLM圧縮ツールの量子化部分をリリースしました。なお、ベンチマーク論文は近日中に公開予定です😊。 + 校正コスト、推論効率、および量子化精度を考慮して、量子化技術をモジュール化し、公平にベンチマークしました。多様なモデルとデータセットでの約600の実験が、校正データ、アルゴリズムパイプライン、および量子化構成の選択に関する3つの洞察を提供します。これらの洞察に基づいて、LLM PTQパイプラインのベストプラクティスが設計され、さまざまなシナリオで最高の精度と効率のパフォーマンスバランスを実現します。 -
+- **2024年3月7日:** 🚀 強力で効率的なLLM圧縮ツールの量子化部分をリリースしました。注目すべきは、ベンチマーク論文が近日公開予定です😊。 -## 主要機能 +## ハイライト機能 -- 💥**包括的なアルゴリズムサポート**: 広範な ✨`SOTA圧縮アルゴリズム` をサポートし、✅量子化、✅混合精度量子化、✅疎性を含み、元のリポジトリと同じ精度を維持します。✨`量子化ベストプラクティス`(ベストプラクティスは[こちら](https://llmc-en.readthedocs.io/en/latest/)をご覧ください)も提供されており、最適なパフォーマンスと効率を確保します。 - -- 💥**サポートされているフォーマット**: ✨`量子化`(整数および浮動小数点)と ✨`疎性` の両方をサポートし、具体的には ✅重量-活性化、✅重量のみ、✅混合精度量子化、および ✅構造化疎性 と ✅非構造化疎性 を含みます。 - -- 💥**広範なモデルサポート**: 多様な ✨`LLMモデル` をサポートしており、✅LLama、✅Mistral、✅InternLM2、✅Qwen2 など、さらに ✅MOE モデルや ✅VLM モデルもサポートしています([サポートされているモデルリスト](#supported-model-list)を参照してください)。 - -- 💥**マルチバックエンドの互換性**: 複数のバックエンドとシームレスに統合し、展開の柔軟性を強化します。さまざまな量子化設定およびモデルフォーマットが、✅VLLM、✅Sglang、✅LightLLM、✅MLC-LLM、✅AutoAWQ など、幅広いバックエンドおよびハードウェアプラットフォームと互換性があり、高い柔軟性を実現しています(`Backend`セクションは[こちら](https://llmc-en.readthedocs.io/en/latest/)をご覧ください)。 - -- 💥**パフォーマンス効率**: ✨`Llama3.1-405B` や ✨`OPT-175B` などの大規模LLMの量子化をサポートし、`単一の A100/H100/H800 GPU` でPPL評価を可能にします。 +- LLMs(例:Llama2-70B、OPT-175B)を量子化し、1つのA100/H100/H800 GPUでPPLを評価します💥。 +- ユーザーが選択できる最先端の圧縮アルゴリズムが[元のリポジトリと一致](benchmark/align.md)し、ユーザーは1つのLLMで複数のアルゴリズムを順次使用できます💥。 +- 特定の圧縮アルゴリズムでツールによってエクスポートされた変換モデル([構成](#構成)の`quant`部分の`save_trans`モード)は、複数のバックエンド(例:[Lightllm](https://github.com/ModelTC/lightllm)、[TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM))によって単純な量子化を行い、特定の圧縮アルゴリズムで最適化されたモデルを取得できます。対応するバックエンドが推論できます💥。 +- 浅いメモリフットプリントを持つ圧縮モデル([構成](#構成)の`quant`部分の`save_lightllm`モード)は、[Lightllm](https://github.com/ModelTC/lightllm)によって直接推論できます💥。 ## 使用方法 -使用ガイドは 🚀`Quick Start`セクション[こちら](https://llmc-en.readthedocs.io/en/latest/)をご覧ください。 +1. このリポジトリをクローンし、パッケージをインストールします: + + ```shell + # パッケージをインストール + cd llmc + pip install -r requirements.txt + ``` + +2. モデルとデータを準備します。 + + ```shell + # huggingfaceからLLMをダウンロードした後、次のように校正データと評価データを準備します: + cd tools + python download_calib_dataset.py --save_path [校正データパス] + python download_eval_dataset.py --save_path [評価データパス] + ``` + +3. アルゴリズムを選択してモデルを量子化します: + + ```shell + # これはAwqに関する例です: + cd scripts + # bashファイル内のllmcのパス``llmc_path``を変更します。``llmc/configs/quantization/Awq/``に配置された構成の1つを選択してモデルを量子化するか、run_awq_llama.shの``--config``引数を変更して提供された構成を使用します。 + bash run_awq_llama.sh + ``` + +## 構成 + +ユーザーが構成を設計するのを支援するために、`llmc/configs/`の下に提供されているすべての構成のいくつかの一般的な構成を説明します: + +- `model`: + + ```yaml + model: + # ``llmc/models/*.py``のクラス名に置き換えます。 + type: Llama + # モデルのパスに置き換えます。 + path: model path + torch_dtype: auto + ``` + +- `calib`: + + ```yaml + # 注意:一部のアルゴリズムには``calib``が必要ありません。例:naive... 
したがって、この部分を削除できます。 + calib: + # 以前にダウンロードした校正データ名に置き換えます。例:pileval、c4、wikitext2、またはptb。 + name: pileval + download: False + # 以前にダウンロードした校正データの1つのパスに置き換えます。例:pileval、c4、wikitext2、またはptb。 + path: calib data path + n_samples: 128 + bs: -1 + seq_len: 512 + # ``llmc/data/dataset/specified_preproc.py``の関数名に置き換えます。 + preproc: general + seed: *seed + ``` + +- `eval`: + + ```yaml + # 事前トレーニング/変換/偽量子化モデルのPPLを評価したい場合。 + eval: + # 事前トレーニング、変換、偽量子化モデルを評価し、評価したい位置を設定できます。 + eval_pos: [pretrain, transformed, fake_quant] + # 以前にダウンロードした評価データの名前に置き換えます。例:c4、wikitext2、ptb、または[c4, wikitext2]。 + name: wikitext2 + download: False + path: eval data path + # 70Bモデルの評価の場合、bsを20に設定し、inference_per_blockをTrueに設定できます。 + # 7B / 13Bモデルの評価の場合、bsを1に設定し、inference_per_blockをFalseに設定できます。 + bs: 1 + inference_per_block: False + seq_len: 2048 + ``` + +- `save`: + + ```yaml + save: + # ``save_trans``がTrueの場合、変換モデル(例:パラメータが変更されたモデル)をエクスポートしたいことを意味します。パフォーマンスと構造は元のモデルと同じであり、ユーザーは単純な量子化を使用して、特定のアルゴリズムで量子化されたモデルと同じパフォーマンスを得ることができます。 + save_trans: False + # ``save_lightllm``がTrueの場合、実際の量子化モデル(例:低ビットの重みと重みおよびアクティベーションの量子化パラメータ)をエクスポートしたいことを意味します。 + save_lightllm: False + # ``save_fake``がTrueの場合、偽量子化モデル(例:量子化解除された重みとアクティベーションの量子化パラメータ)をエクスポートしたいことを意味します。 + save_fake: False + save_path: ./save + ``` + +- `quant`: + + ```yaml + quant: + # ``llmc/compression/quantization/*.py``のクラス名に置き換えます。 + method: OmniQuant + # 重みのみの量子化には``act``部分がありません。 + weight: + bit: 8 + symmetric: True + # 量子化の粒度:per_channel、per_tensor、per_head(推奨されません)。 + granularity: per_channel + group_size: -1 + # 校正アルゴリズム:learnble、mse、およびminmax(デフォルト)。 + calib_algo: learnable + # ストレートスルー推定を使用します。これは、学習可能な校正アルゴリズムに必要です。 + ste: True + act: + bit: 8 + symmetric: True + # 量子化の粒度:per_token、per_tensor + granularity: per_token + ste: True + # 静的量子化(校正中の量子化)または動的量子化(推論中の量子化)。 + static: True + # この部分は特定のアルゴリズム用に設計されており、提供されているものを参考にして独自のアルゴリズムを設計できます。 + special: + let: True + lwc_lr: 0.01 + let_lr: 0.005 + use_shift: False + alpha: 0.5 + deactive_amp: True + epochs: 20 + wd: 0 + # quant_outがTrueの場合、前の量子化ブロックの出力を次のブロックの校正データとして使用します。 + quant_out: True + ``` ## サポートされているモデルリスト @@ -136,33 +231,7 @@ docker pull registry.cn-hangzhou.aliyuncs.com/yongyang/llmcompression:pure-lates ✅ [LLaVA](https://github.com/haotian-liu/LLaVA) -✅ [InternLM2.5](https://huggingface.co/internlm) - -✅ [StableLM](https://github.com/Stability-AI/StableLM) - -✅ [Gemma2](https://huggingface.co/docs/transformers/main/en/model_doc/gemma2) - -✅ [Phi2](https://huggingface.co/microsoft/phi-2) - -✅ [Phi 1.5](https://huggingface.co/microsoft/phi-1_5) - -✅ [MiniCPM](https://github.com/OpenBMB/MiniCPM) - -✅ [SmolLM](https://huggingface.co/collections/HuggingFaceTB/smollm-6695016cad7167254ce15966) - -独自のモデルタイプを追加するには、`llmc/models/*.py` ファイルを参照してください。 - -## サポートされているバックエンドリスト - -✅ [VLLM](https://github.com/vllm-project/vllm) - -✅ [LightLLM](https://github.com/ModelTC/lightllm) - -✅ [Sglang](https://github.com/sgl-project/sglang) - -✅ [MLC-LLM](https://github.com/mlc-ai/mlc-llm) - -✅ [AutoAWQ](https://github.com/casper-hansen/AutoAWQ) +`llmc/models/*.py`の下のファイルを参照して、独自のモデルタイプを追加できます。 ## サポートされているアルゴリズムリスト @@ -198,9 +267,9 @@ docker pull registry.cn-hangzhou.aliyuncs.com/yongyang/llmcompression:pure-lates ✅ [QuaRot](https://arxiv.org/abs/2404.00456) -### プルーニング(剪定) +### 剪定 -✅ Naive(マグニチュード) +✅ Naive(Magnitude) ✅ [Wanda](https://arxiv.org/abs/2306.11695) @@ -223,7 +292,6 @@ docker pull registry.cn-hangzhou.aliyuncs.com/yongyang/llmcompression:pure-lates - https://github.com/mobiusml/hqq - 
[https://github.com/spcl/QuaRot](https://github.com/spcl/QuaRot) - [https://github.com/locuslab/wanda](https://github.com/locuslab/wanda) -- [https://github.com/EleutherAI/lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-harness) ## スター履歴 diff --git a/README_zh.md b/README_zh.md index e8ed8a4a5..4732f561a 100644 --- a/README_zh.md +++ b/README_zh.md @@ -1,114 +1,211 @@ -# LLMC: 准确高效的LLM压缩工具 +# llmc:向精确高效的大型语言模型压缩迈进 llmc -[![License](https://img.shields.io/badge/License-Apache_2.0-blue.svg)](https://opensource.org/licenses/Apache-2.0) +[![许可证](https://img.shields.io/badge/许可证-Apache_2.0-blue.svg)](https://opensource.org/licenses/Apache-2.0) [![arXiv](https://img.shields.io/badge/LLMC-2405.06001-b31b1b)](https://arxiv.org/abs/2405.06001) -[![GitHub Stars](https://img.shields.io/github/stars/ModelTC/llmc.svg?style=social&label=Star&maxAge=60)](https://github.com/ModelTC/llmc) -![visitors](https://komarev.com/ghpvc/?username=llmc&label=visitors) +[![GitHub 星标](https://img.shields.io/github/stars/ModelTC/llmc.svg?style=social&label=Star&maxAge=60)](https://github.com/ModelTC/llmc) [![Discord Banner](https://img.shields.io/discord/1139835312592392214?logo=discord&logoColor=white)](https://discord.gg/qZKUDfhm) [![QQ](https://img.shields.io/badge/QQ-EB1923?logo=tencent-qq&logoColor=white)](http://qm.qq.com/cgi-bin/qm/qr?_wv=1027&k=I9IGPWWj8uuRXWH3_ELWjouf6gkIMgUl&authKey=GA3WbFAsm90ePJf%2FCbc7ZyXXq4ShQktlBaLxgqS5yuSPAsr3%2BDKMRdosUiLYoilO&noverify=0&group_code=526192592) [![Doc](https://img.shields.io/badge/docs-English-99cc2)](https://llmc-en.readthedocs.io/en/latest/) [![Doc](https://img.shields.io/badge/文档-中文-99cc2)](https://llmc-zhcn.readthedocs.io/en/latest/) -**\[ English | [中文](README_zh.md) | [日本語](README_ja.md) \]** +**\[ [English](https://github.com/ModelTC/llmc?tab=readme-ov-file#llmc-towards-accurate-and-efficient-llm-compression) | 中文 | [日本語](README_ja.md) \]** -**LLMC** 是一个开箱即用的工具,专为压缩LLM设计,利用最先进的压缩算法提高效率并减少模型体积,同时不影响预测精度。 +**llmc** 是一个即插即用的工具,旨在通过最先进的压缩算法进行大型语言模型的压缩,以提高效率并减小模型大小,同时不牺牲性能。 -**英文文档**在[此处](https://llmc-en.readthedocs.io/en/latest/)。 +**英文文档**在[这里](https://llmc-en.readthedocs.io/en/latest/). -**中文文档**在[此处](https://llmc-zhcn.readthedocs.io/en/latest/)。 - -**docker hub**在[此处](https://hub.docker.com/r/llmcompression/llmc)。 - -**阿里云docker**: `registry.cn-hangzhou.aliyuncs.com/yongyang/llmcompression:[tag]` - -你可以通过以下命令下载可以运行llmc的docker镜像,中国大陆用户推荐使用阿里云docker。 - -docker hub - -``` -docker pull llmcompression/llmc:pure-latest -``` - -阿里云docker - -``` -docker pull registry.cn-hangzhou.aliyuncs.com/yongyang/llmcompression:pure-latest -``` +**中文文档**在[这里](https://llmc-zhcn.readthedocs.io/en/latest/). 
**社区**: -- [Discord 服务器](https://discord.gg/qZKUDfhm) -- [腾讯QQ群](http://qm.qq.com/cgi-bin/qm/qr?_wv=1027&k=I9IGPWWj8uuRXWH3_ELWjouf6gkIMgUl&authKey=GA3WbFAsm90ePJf%2FCbc7ZyXXq4ShQktlBaLxgqS5yuSPAsr3%2BDKMRdosUiLYoilO&noverify=0&group_code=526192592) - -## 最新消息 - -- **2024年9月26日:** 🔥 我们现在支持从🚀 `LLMC`导出💥 `FP8 量化(E4M3,E5M2)`模型到一些先进的推理后端,例如[VLLM](https://github.com/vllm-project/vllm)和[SGLang](https://github.com/sgl-project/sglang)。关于详细使用方法,请参阅[VLLM文档](https://llmc-zhcn.readthedocs.io/en/latest/backend/vllm.html)和[SGLang文档](https://llmc-zhcn.readthedocs.io/en/latest/backend/sglang.html)。 +- [Discord群](https://discord.gg/qZKUDfhm) +- [QQ群](http://qm.qq.com/cgi-bin/qm/qr?_wv=1027&k=I9IGPWWj8uuRXWH3_ELWjouf6gkIMgUl&authKey=GA3WbFAsm90ePJf%2FCbc7ZyXXq4ShQktlBaLxgqS5yuSPAsr3%2BDKMRdosUiLYoilO&noverify=0&group_code=526192592) -- **2024年9月24日:** 🔥 我们正式发布了 ✨`Llama-3.1-405B` 的 ✅INT4 和 ✅INT8 模型,这些模型通过 🚀`LLMC` 使用 `save_lightllm` 模式进行量化。你可以在[此处](https://huggingface.co/Dongz/llama31-405b-quant)下载模型参数。 +## 新闻 -- **2024年9月23日:** 🔥 我们现在支持从 🚀`LLMC` 导出 ✨`真正量化的(INT4, INT8)` 模型到高级推理后端,例如 [VLLM](https://github.com/vllm-project/vllm), [SGLang](https://github.com/sgl-project/sglang), [AutoAWQ](https://github.com/casper-hansen/AutoAWQ), 和 [MLC-LLM](https://github.com/mlc-ai/mlc-llm) 用于量化推理部署,从而实现 ✨`减少内存使用` 和 ✨`加快推理速度`。 - 详细使用方法,请参考 [VLLM 文档](https://llmc-zhcn.readthedocs.io/en/latest/backend/vllm.html)、[SGLang 文档](https://llmc-zhcn.readthedocs.io/en/latest/backend/sglang.html)、[AutoAWQ 文档](https://llmc-zhcn.readthedocs.io/en/latest/backend/autoawq.html) 和 [MLC-LLM 文档](https://llmc-zhcn.readthedocs.io/en/latest/backend/mlcllm.html)。 +- **2024 年 7 月 23 日:** 🍺🍺🍺 我们发布了全新版本的基准论文: -- **2024年9月9日:** 🔥 我们提供了一些最佳实践配置,帮助提升性能(参见最佳实践[此处](https://llmc-zhcn.readthedocs.io/en/latest/))。 - -- **2024年9月3日:** 🔥 我们支持通过[opencompass](https://github.com/open-compass/opencompass) 评估 🚀`LLMC` 模型。请参考此[文档](https://llmc-zhcn.readthedocs.io/en/latest/advanced/model_test_v2.html)试用! - -- **2024年8月22日:** 🔥我们支持许多小型语言模型,包括当前SOTA的 [SmolLM](https://huggingface.co/collections/HuggingFaceTB/smollm-6695016cad7167254ce15966)(参见[支持的模型列表](#supported-model-list))。 - -- **2024年8月22日:** 🔥此外,我们还支持通过我们修改的 [lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-harness) 进行下游任务评估 🤗。具体操作,用户可以先采用 `save_trans` 模式(参见 [配置](https://llmc-zhcn.readthedocs.io/en/latest/configs.html) 中的 `save` 部分)保存权重修改后的模型。在获得转换模型后,可以直接参考 [run_lm_eval.sh](scripts/run_lm_eval.sh) 对量化模型进行评估。更多细节请见[此处](https://llmc-zhcn.readthedocs.io/en/latest/advanced/model_test_v1.html)。 - -- **2024年7月23日:** 🍺🍺🍺 我们发布了全新的基准论文: - - [**LLMC: Benchmarking Large Language Model Quantization with a Versatile Compression Toolkit**](https://arxiv.org/abs/2405.06001v2)。 + [**LLMC:使用多功能压缩工具包对大型语言模型量化进行基准测试**](https://arxiv.org/abs/2405.06001v2)。 [Ruihao Gong\*](https://xhplus.github.io/), [Yang Yong\*](https://github.com/helloyongyang), [Shiqiao Gu\*](https://github.com/gushiqiao), [Yushi Huang\*](https://github.com/Harahan), [Chengtao Lv](https://scholar.google.com/citations?user=r8vseSUAAAAJ&hl=en), [Yunchen Zhang](https://scholar.google.com/citations?user=glkWFyUAAAAJ&hl=en), [Xianglong Liu📧](https://xlliu-beihang.github.io/), [Dacheng Tao](https://scholar.google.com/citations?user=RwlJNLcAAAAJ&hl=en) (\* 表示同等贡献,📧 表示通讯作者。) -
-历史消息 +
+ comp +
-- **2024年7月16日:** 🔥我们现在支持 Wanda/Naive(幅度)进行 LLM 稀疏化和逐层混合比特量化! + 我们不关注最佳实践,而是考虑校准数据、算法和数据格式,以模块化和公平的方式对 LLM 量化进行基准测试。通过详细的观察和分析,我们为不同配置下的性能和方法改进提供了各种类型的新点。借助强大的工具包 LLMC 和全面的见解,未来的 LLM 研究人员可以有效地将合适的算法和低位格式集成到他们的应用中,从而使大型语言模型的压缩变得民主化。 -- **2024年7月14日:** 🔥我们现在支持基于旋转的量化 QuaRot! +- **2024年7月16号:** 🔥我们现在已经支持了大模型稀疏算法Wanda/Naive(Magnitude)和层间混合bit量化! -- **2024年5月17日:** 🚀 我们现在支持一些先进的大型模型,例如 LLaVA、Mixtral、LLaMA V3 和 Qwen V2。快来试试吧! +- **2024年7月14号:** 🔥我们现在已经支持了旋转类量化算法QuaRot! + +- **2024年7月4日:** 📱 我们提供了公开的讨论渠道. 如果您有任何问题,可以加入我们的社区: + + - [Discord群](https://discord.gg/qZKUDfhm) + - [QQ群](http://qm.qq.com/cgi-bin/qm/qr?_wv=1027&k=I9IGPWWj8uuRXWH3_ELWjouf6gkIMgUl&authKey=GA3WbFAsm90ePJf%2FCbc7ZyXXq4ShQktlBaLxgqS5yuSPAsr3%2BDKMRdosUiLYoilO&noverify=0&group_code=526192592) - **2024年5月13日:** 🍺🍺🍺 我们发布了量化基准论文: - [**LLM-QBench: A Benchmark Towards the Best Practice for Post-training Quantization of Large Language Models**](https://arxiv.org/abs/2405.06001)。 + [**LLM-QBench:大型语言模型训练后量化的最佳实践基准**](https://arxiv.org/abs/2405.06001). [Ruihao Gong\*](https://xhplus.github.io/), [Yang Yong\*](https://github.com/helloyongyang), [Shiqiao Gu\*](https://github.com/gushiqiao), [Yushi Huang\*](https://github.com/Harahan), [Yunchen Zhang](https://scholar.google.com/citations?user=glkWFyUAAAAJ&hl=en), [Xianglong Liu📧](https://xlliu-beihang.github.io/), [Dacheng Tao](https://scholar.google.com/citations?user=RwlJNLcAAAAJ&hl=en) - (\* 表示同等贡献,📧 表示通讯作者。) + (\* 表示共同第一作者, 📧 表示通讯作者.)
comp
- 我们模块化且公平地基准测试了量化技术,考虑了校准成本、推理效率和量化准确性。在多种模型和数据集上进行了近600次实验,得出了三个关于校准数据、算法管道和量化配置选择的有见地的结论。基于这些结论,设计了一种LLM后训练量化管道的最佳实践,以在各种场景下实现最佳的准确性和效率平衡。 - -- **2024年3月7日:** 🚀 我们发布了一个功能强大且高效的LLM压缩工具的量化部分。值得注意的是,我们的基准论文即将发布😊。 - -
- -## 亮点功能 - -- 💥**综合算法支持**: 提供广泛的 ✨`SOTA压缩算法` 支持,包括 ✅量化、✅混合精度量化 和 ✅稀疏化,同时保持与原始仓库一致的精度。我们还提供 ✨`量化最佳实践`(参见✨`最佳实践` 章节[此处](https://llmc-zhcn.readthedocs.io/en/latest/)),确保最佳性能和效率。 - -- 💥**支持的格式**: 支持 ✨`量化`(整型和浮点)和 ✨`稀疏化`,具体包括 ✅权重激活量化、✅权重量化、✅混合精度量化,以及 ✅结构化 和 ✅非结构化稀疏化。 - -- 💥**广泛模型支持**: 支持多种 ✨`LLM模型`,包括 ✅LLama、✅Mistral、✅InternLM2、✅Qwen2 等,以及 ✅MOE 和 ✅VLM 模型(参见[支持的模型列表](#supported-model-list))。 - -- 💥**多后端兼容性**: 无缝集成多个后端,增强部署灵活性。多种量化设置和模型格式兼容广泛的后端和硬件平台,例如 ✅VLLM、✅Sglang、✅LightLLM、✅MLC-LLM 和 ✅AutoAWQ,使其高度灵活(参见✨`推理后端` 章节 [此处](https://llmc-zhcn.readthedocs.io/en/latest/))。 - -- 💥**性能效率**: 支持大规模LLM的量化,例如 ✨`Llama3.1-405B` 和 ✨`OPT-175B`,并可在 `单个 A100/H100/H800 GPU` 上评估 PPL。 - -## 使用指南 - -请参阅 🚀`快速入门`章节[此处](https://llmc-zhcn.readthedocs.io/en/latest/)。 + 我们模块化并公正地基准测试了量化技术,考虑到校准成本、推理效率和量化精度。在多种模型和数据集上进行的近 600 项实验提供了三个洞见: + 关于校准数据、算法流程和量化配置选择。基于这些洞见,设计了一个最佳的大型语言模型 PTQ 流程,实现了在各种场景下最佳的精确度和效率性能平衡。 + +- **2024年3月7日:** 🚀 我们发布了强大且高效的大型语言模型压缩工具的量化部分。值得注意的是,我们的基准论文即将发布😊。 + +## 突出特性 + +- 量化大型语言模型,如 Llama2-70B、OPT-175B,并在仅一个 A100/H100/H800 GPU上评估其 PPL💥。 +- 为用户提供选择的最新的[与原论文代码仓库精度对齐](benchmark/align.md)的压缩算法,并且用户可以在一个大型语言模型上依次使用多个算法💥。 +- 由我们工具通过特定压缩算法导出的转换模型(`save_trans`模式在`quant`部分的[配置](#配置))可以通过多个后端进行简单量化,得到经过特定压缩算法优化的模型,相应的后端可以进行推断💥。 +- 我们的压缩模型(`save_lightllm`模式在`quant`部分的\[配置\](# + +配置))具有较低的内存占用,可以直接通过[Lightllm](https://github.com/ModelTC/lightllm)进行推断💥。 + +## 使用方式 + +1. 克隆此仓库并安装包: + + ```shell + # 安装包 + cd llmc + pip install -r requirements.txt + ``` + +2. 准备模型和数据。 + + ```shell + # 在从huggingface下载LLM后,按以下方式准备校准和评估数据: + cd tools + python download_calib_dataset.py --save_path [校准数据路径] + python download_eval_dataset.py --save_path [评估数据路径] + ``` + +3. 选择一个算法来量化你的模型: + + ```shell + # 这是一个关于 Awq 的例子: + cd scripts + # 修改 bash 文件中的 llmc 路径,``llmc_path``。你也可以选择``llmc/configs/quantization/Awq/``中的一个配置来量化你的模型,或者通过更改``--config``参数在 run_awq_llama.sh 中使用我们提供的配置。 + bash run_awq_llama.sh + ``` + +## 配置 + +为了帮助用户设计他们的配置,我们现在解释我们在`llmc/configs/`下提供的所有配置中的一些通用配置: + +- `model`: + + ```yaml + model: + # 用``llmc/models/*.py``中的类名替换。 + type: Llama + # 用你的模型路径替换。 + path: model path + torch_dtype: auto + ``` + +- `calib`: + + ```yaml + # 注意:一些算法不需要``calib``,如 naive... 
所以,你可以移除这部分。 + calib: + # 用之前下载的校准数据名称替换,例如,pileval、c4、wikitext2 或 ptb。 + name: pileval + download: False + # 用之前下载的某个校准数据的路径替换,例如,pileval、c4、wikitext2 或 ptb。 + path: calib data path + n_samples: 128 + bs: -1 + seq_len: 512 + # 用``llmc/data/dataset/specified_preproc.py``中的函数名称替换。 + preproc: general + seed: *seed + ``` + +- `eval`: + + ```yaml + # 如果你想评估你的预训练/转换/假量化模型的 PPL。 + eval: + # 你可以评估预训练、转换、假量化模型,并设置你想要评估的位置。 + eval_pos: [pretrain, transformed, fake_quant] + # 用之前下载的评估数据的名称替换,例如,c4、wikitext2、ptb 或 [c4, wikitext2]。 + name: wikitext2 + download: False + path: eval data path + # 对于 70B 模型评估,bs 可以设置为 20,并且可以将 inference_per_block 设置为 True。 + # 对于 7B / 13B 模型评估,bs 可以设置为 1,并且可以将 inference_per_block 设置为 False。 + bs: 1 + inference_per_block: False + seq_len: 2048 + ``` + +- `save`: + + ```yaml + save: + # 如果``save_trans``为 True,这意味着你想要导出转换模型,例如,参数修改的模型,其性能和结构与原始模型相同,用户可以对转换模型进行简单量化,以获得与特定算法量化模型相同的性能。 + save_trans: False + # 如果``save_lightllm``为 True,这意味着你想要导出真实的量化模型,例如,低位权重和权重及激活量化参数。 + save_lightllm: False + # 如果``save_fake``为 True,意味着你想要导出假量化模型,例如,去量化的权重和激活量化参数。 + save_fake: False + save_path: ./save + + ``` + +- `quant`: + + ```yaml + quant: + # 用``llmc/compression/quantization/*.py``中的类名替换。 + method: OmniQuant + # 仅权重量化没有``act``部分。 + weight: + bit: 8 + symmetric: True + # 量化粒度:per_channel, per_tensor, per_head(不推荐)。 + granularity: per_channel + group_size: -1 + # 校准算法:learnble, mse, 以及 minmax(默认)。 + calib_algo: learnable + # 使用直通估计(Stright-Through Estimation),这对于可学习的校准算法是必需的。 + ste: True + act: + bit: 8 + symmetric: True + # 量化粒度:per_token, per_tensor + granularity: per_token + ste: True + # 静态量化(校准期间的量化)或动态量化(推理期间的量化)。 + static: True + # 这部分是为特定算法设计的,用户可以参考我们提供的算法来设计他们自己的算法。 + special: + let: True + lwc_lr: 0.01 + let_lr: 0.005 + use_shift: False + alpha: 0.5 + deactive_amp: True + epochs: 20 + wd: 0 + # 如果 quant_out 为 True,使用前一个量化块的输出作为后续块的校准数据。 + quant_out: True + + ``` ## 支持的模型列表 @@ -130,39 +227,7 @@ docker pull registry.cn-hangzhou.aliyuncs.com/yongyang/llmcompression:pure-lates ✅ [LLaMA V3](https://huggingface.co/meta-llama) -✅ [Mixtral](https://huggingface.co/docs/transformers/model_doc/mixtral) - -✅ [Qwen V2](https://github.com/QwenLM/Qwen2) - -✅ [LLaVA](https://github.com/haotian-liu/LLaVA) - -✅ [InternLM2.5](https://huggingface.co/internlm) - -✅ [StableLM](https://github.com/Stability-AI/StableLM) - -✅ [Gemma2](https://huggingface.co/docs/transformers/main/en/model_doc/gemma2) - -✅ [Phi2](https://huggingface.co/microsoft/phi-2) - -✅ [Phi 1.5](https://huggingface.co/microsoft/phi-1_5) - -✅ [MiniCPM](https://github.com/OpenBMB/MiniCPM) - -✅ [SmolLM](https://huggingface.co/collections/HuggingFaceTB/smollm-6695016cad7167254ce15966) - -你可以参考 `llmc/models/*.py` 文件添加自己的模型类型。 - -## 支持的后端列表 - -✅ [VLLM](https://github.com/vllm-project/vllm) - -✅ [LightLLM](https://github.com/ModelTC/lightllm) - -✅ [Sglang](https://github.com/sgl-project/sglang) - -✅ [MLC-LLM](https://github.com/mlc-ai/mlc-llm) - -✅ [AutoAWQ](https://github.com/casper-hansen/AutoAWQ) +你可以参考 `llmc/models/*.py` 下的文件添加你自己的模型类型。 ## 支持的算法列表 @@ -200,13 +265,13 @@ docker pull registry.cn-hangzhou.aliyuncs.com/yongyang/llmcompression:pure-lates ### 剪枝 -✅ Naive(Magnitude) +✅ Naive(Magnitude) ✅ [Wanda](https://arxiv.org/abs/2306.11695) ✅ [ShortGPT](https://arxiv.org/abs/2403.03853) -## 鸣谢 +## 致谢 我们的代码参考了以下仓库: @@ -221,15 +286,11 @@ docker pull registry.cn-hangzhou.aliyuncs.com/yongyang/llmcompression:pure-lates - https://github.com/xvyaward/owq - https://github.com/TimDettmers/bitsandbytes - https://github.com/mobiusml/hqq -- 
[https://github.com/spcl/QuaRot](https://github.com/spcl/QuaRot) - [https://github.com/locuslab/wanda](https://github.com/locuslab/wanda) -- [https://github.com/EleutherAI/lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-harness) - -## Star 历史 -[![Star History Chart](https://api.star-history.com/svg?repos=ModelTC/llmc&type=Timeline)](https://star-history.com/#ModelTC/llmc&Timeline) +## 星标历史 -## 引用 +[![星标历史图表](https://api.star-history.com/svg?repos=ModelTC/llmc&type=Timeline)](https://star-history.com/#ModelTC/llmc&Timeline) ## 引用 diff --git a/benchmark/align.md b/benchmark/align.md new file mode 100644 index 000000000..295cd22f0 --- /dev/null +++ b/benchmark/align.md @@ -0,0 +1,42 @@ +## Alignment with the Original Paper + +### The conda environment is consistent with the requirements.txt file and the model is LLama2-7b + +### All other configurations are aligned with the original paper/code: + +| | calib_data | seq_len | num_data | seed | +| ----------- | ---------- | ------- | -------- | ---- | +| GPTQ | c4 | 2048 | 128 | 0 | +| AWQ | pileval | 512 | 128 | 42 | +| Omniquant | wikitext2 | 2048 | 128 | 2 | +| Smoothquant | pileval | 512 | 128 | 42 | +| Os_plus | pileval | 512 | 128 | 42 | + +### Results + +#### Weight-Only Asymmetric Quantization Results + +| | w4a16g128 | w3a16g128 | w2a16g64 | +| -------------- | --------- | --------- | -------- | +| GPTQ | 5.623 | 6.318 | 14.968 | +| GPTQ-LLMC | 5.623 | 6.318 | 14.968 | +| AWQ | 5.601 | 6.243 | 2.16e5 | +| AWQ-LLMC | 5.601 | 6.238 | 2.16e5 | +| Omniquant | 5.590 | 6.092 | 9.525 | +| Omniquant-LLMC | 5.590 | 6.092 | 9.525 | + +#### Weight-Activation Asymmetric Quantization Results + +| | w8a8 | w6a6 | w4a4 | +| -------------- | ----- | ----- | ------ | +| Omniquant | 5.491 | 5.703 | 12.212 | +| Omniquant-LLMC | 5.490 | 5.703 | 12.239 | + +#### Weight-Activation Symmetric Quantization Results + +| | w8a8 | +| ---------------- | ----- | +| SmoothQuant | 5.589 | +| SmoothQuant-LLMC | 5.589 | +| Os_plus | 5.511 | +| Os_plus-LLMC | 5.517 | diff --git a/benchmark/calib.md b/benchmark/calib.md new file mode 100644 index 000000000..ddef3c41d --- /dev/null +++ b/benchmark/calib.md @@ -0,0 +1,53 @@ +## Impact of calibration data + +### Setting 1: w4a16g128 llama2-7b seq_len=512 + +#### Calibrate with wikitext2 + +| | wikitext2 | c4 | ptb | +| --------- | --------- | ----- | ------ | +| GPTQ | **5.575** | 7.470 | 63.575 | +| AWQ | **5.595** | 7.444 | 35.167 | +| OmniQuant | **5.586** | 7.455 | 34.192 | + +#### Calibrate with c4 + +| | wikitext2 | c4 | ptb | +| --------- | --------- | --------- | ------- | +| GPTQ | 5.615 | **7.443** | 122.070 | +| AWQ | 5.596 | **7.436** | 33.148 | +| OmniQuant | 5.620 | 7.457 | 34.001 | + +#### Calibrate with pileval + +| | wikitext2 | c4 | ptb | +| --------- | --------- | ----- | ------ | +| GPTQ | 5.610 | 7.477 | 136.84 | +| AWQ | 5.613 | 7.438 | 33.18 | +| OmniQuant | 5.618 | 7.458 | 34.526 | + +### Setting 2: w3a16g128 llama2-7b seq_len=512 + +#### Calibrate with wikitext2 + +| | wikitext2 | c4 | ptb | +| --------- | --------- | ----- | ------- | +| GPTQ | **6.133** | 8.696 | 234.977 | +| AWQ | **6.138** | 8.272 | 38.86 | +| OmniQuant | **6.096** | 8.325 | 40.667 | + +#### Calibrate with c4 + +| | wikitext2 | c4 | ptb | +| --------- | --------- | --------- | ------- | +| GPTQ | 6.324 | **8.385** | 358.013 | +| AWQ | 6.181 | **8.249** | 39.27 | +| OmniQuant | 6.259 | **8.317** | 41.835 | + +#### Calibrate with pileval + +| | wikitext2 | c4 | ptb | +| --------- | --------- | ----- | ------- | 
+| GPTQ | 6.330 | 8.534 | 263.279 | +| AWQ | 6.217 | 8.284 | 37.117 | +| OmniQuant | 6.214 | 8.320 | 42.335 | diff --git a/ci_check/awq_w4a16_fakequant_eval.yml b/ci_check/awq_w4a16_fakequant_eval.yml index 5f3700f41..c2b0cf5ca 100644 --- a/ci_check/awq_w4a16_fakequant_eval.yml +++ b/ci_check/awq_w4a16_fakequant_eval.yml @@ -20,7 +20,6 @@ eval: path: /home/runner/work/llmc/llmc/check/datasets/eval/wikitext2 bs: 1 seq_len: 16 # 2048 - eval_token_consist: True quant: method: Awq weight: diff --git a/ci_check/change_files.py b/ci_check/change_files.py index c07db9eca..25b69cf7f 100644 --- a/ci_check/change_files.py +++ b/ci_check/change_files.py @@ -40,15 +40,7 @@ def main(): "modifications": [ ( "torch.cuda.empty_cache()", - "if use_cuda: torch.cuda.empty_cache()" - ), - ( - "init_process_group(backend='nccl')", - "init_process_group(backend='gloo')" - ), - ( - "torch.cuda.set_device(int(os.environ['LOCAL_RANK']))", - "# torch.cuda.set_device(int(os.environ['LOCAL_RANK']))" + "if use_cuda: torch.cuda.empty_cache()", ) ], } @@ -89,20 +81,6 @@ def main(): ), ], } - elif file_path == "../llmc/eval/eval_base.py": - modifications = { - "header": [ - 'device_zbl = "cpu"\n', - 'use_cuda = (device_zbl != "cpu")\n', - ], - "modifications": [ - (".cuda()", ".to(device_zbl)"), - ( - "torch.cuda.empty_cache()", - "if use_cuda: torch.cuda.empty_cache()", - ), - ], - } elif file_path == "../llmc/eval/eval_ppl.py": modifications = { "header": [ @@ -119,22 +97,6 @@ def main(): ("nlls = []", "nlls = []; nsamples = nsamples_zbl"), ], } - elif file_path == "../llmc/eval/eval_token_consist.py": - modifications = { - "header": [ - 'device_zbl = "cpu"\n', - 'use_cuda = (device_zbl != "cpu")\n', - "nsamples_zbl = 1\n", - ], - "modifications": [ - (".cuda()", ".to(device_zbl)"), - ( - "torch.cuda.empty_cache()", - "if use_cuda: torch.cuda.empty_cache()", - ), - ("for i in range(0, nsamples, bs):", "for i in range(0, 1, 1):"), - ], - } else: print(f"File {file_path} not recognized or not specified for modification.") continue diff --git a/ci_check/cpu.txt b/ci_check/cpu.txt index f95571cd9..6eadabb18 100644 --- a/ci_check/cpu.txt +++ b/ci_check/cpu.txt @@ -1,7 +1,5 @@ ../llmc/compression/quantization/base_blockwise_quantization.py ../llmc/__main__.py -../llmc/eval/eval_base.py -../llmc/eval/eval_token_consist.py ../llmc/eval/eval_ppl.py ../llmc/compression/quantization/awq.py ../llmc/models/base_model.py \ No newline at end of file diff --git a/ci_check/run.sh b/ci_check/run.sh index d5ad5dcb4..24dc9da9f 100644 --- a/ci_check/run.sh +++ b/ci_check/run.sh @@ -4,22 +4,7 @@ current_directory=$(pwd) llmc=$(echo "$current_directory" | sed 's/\/ci_check$//') export PYTHONPATH=$llmc:$PYTHONPATH -config=${llmc}/ci_check/awq_w4a16_fakequant_eval.yml - -nnodes=1 -nproc_per_node=1 -MASTER_ADDR=127.0.0.1 -MASTER_PORT=$((10000 + RANDOM % 20000)) - -RANDOM=$(python -c 'import uuid; print(uuid.uuid4())') -task_id=$RANDOM cd ../scripts -torchrun \ - --nnodes $nnodes \ - --nproc_per_node $nproc_per_node \ - --rdzv_id $task_id \ - --rdzv_backend c10d \ - --rdzv_endpoint $MASTER_ADDR:$MASTER_PORT \ - ${llmc}/llmc/__main__.py --config $config --task_id $task_id \ +python -m llmc --config ../ci_check/awq_w4a16_fakequant_eval.yml diff --git a/configs/quantization/AdaDim/adadim_w8a8_fakequant_eval.yml b/configs/quantization/AdaDim/adadim_w8a8_fakequant_eval.yml new file mode 100644 index 000000000..c88b1e8bd --- /dev/null +++ b/configs/quantization/AdaDim/adadim_w8a8_fakequant_eval.yml @@ -0,0 +1,37 @@ +base: + seed: &seed 0 +model: + 
type: Llama + path: model path + torch_dtype: auto +calib: + name: c4 + download: False + path: calib data path + n_samples: 128 + bs: 1 + seq_len: 2048 + preproc: c4_gptq + seed: *seed +eval: + eval_pos: [pretrain, fake_quant] + name: wikitext2 + download: False + path: eval data path + bs: 1 + seq_len: 2048 +quant: + method: AdaDim + weight: + bit: 8 + symmetric: True + granularity: per_channel + group_size: -1 + act: + bit: 8 + symmetric: True + granularity: per_token + quant_out: True +save: + save_trans: False + save_path: ./save diff --git a/configs/quantization/Awq/awq_w4a16.yml b/configs/quantization/Awq/awq_w4a16.yml new file mode 100644 index 000000000..617bc6fa1 --- /dev/null +++ b/configs/quantization/Awq/awq_w4a16.yml @@ -0,0 +1,33 @@ +base: + seed: &seed 42 +model: + type: Llama + path: model path + torch_dtype: auto +calib: + name: pileval + download: False + path: calib data path + n_samples: 128 + bs: -1 + seq_len: 512 + preproc: pileval_awq + seed: *seed +eval: + # eval_pos: [] + eval_pos: [pretrain, transformed] + name: wikitext2 + download: False + path: eval data path + bs: 1 + seq_len: 2048 +quant: + method: Awq + weight: + bit: 4 + symmetric: False + granularity: per_group + group_size: 128 +save: + save_trans: True + save_path: ./save diff --git a/configs/quantization/Awq/awq_w4a16_fakequant_eval.yml b/configs/quantization/Awq/awq_w4a16_fakequant_eval.yml new file mode 100644 index 000000000..83113f037 --- /dev/null +++ b/configs/quantization/Awq/awq_w4a16_fakequant_eval.yml @@ -0,0 +1,32 @@ +base: + seed: &seed 42 +model: + type: Llama + path: model path + torch_dtype: auto +calib: + name: pileval + download: False + path: calib data path + n_samples: 128 + bs: -1 + seq_len: 512 + preproc: pileval_awq + seed: *seed +eval: + eval_pos: [pretrain, transformed, fake_quant] + name: wikitext2 + download: False + path: eval data path + bs: 1 + seq_len: 2048 +quant: + method: Awq + weight: + bit: 4 + symmetric: False + granularity: per_group + group_size: 128 +save: + save_trans: False + save_path: ./save diff --git a/configs/quantization/Awq/awq_w4a16_fakequant_eval_general.yml b/configs/quantization/Awq/awq_w4a16_fakequant_eval_general.yml new file mode 100644 index 000000000..246f3596f --- /dev/null +++ b/configs/quantization/Awq/awq_w4a16_fakequant_eval_general.yml @@ -0,0 +1,37 @@ +base: + seed: &seed 42 +model: + type: Llama + path: model path + tokenizer_mode: slow + torch_dtype: auto +calib: + name: pileval + download: False + path: calib data path + n_samples: 128 + bs: -1 + seq_len: 512 + preproc: general + seed: *seed +eval: + eval_pos: [pretrain, fake_quant] + name: wikitext2 + download: False + path: eval data path + bs: 1 + inference_per_block: False + # For 70B model eval, bs can be set to 20, and inference_per_block can be set to True. + # For 7B / 13B model eval, bs can be set to 1, and inference_per_block can be set to False. 
+ seq_len: 2048 +quant: + method: Awq + weight: + bit: 4 + symmetric: False + granularity: per_group + group_size: 128 +save: + save_trans: False + save_path: ./save + tokenizer_file_substring: ["token"] diff --git a/configs/quantization/Awq/awq_w4a16_fakequant_eval_general_custom.yml b/configs/quantization/Awq/awq_w4a16_fakequant_eval_general_custom.yml new file mode 100644 index 000000000..3d5b517dd --- /dev/null +++ b/configs/quantization/Awq/awq_w4a16_fakequant_eval_general_custom.yml @@ -0,0 +1,36 @@ +base: + seed: &seed 42 +model: + type: Llama + path: model path + torch_dtype: auto +calib: + name: custom + download: False + load_from_txt: True + path: ./inputs.txt + n_samples: 128 + bs: -1 + seq_len: 512 + preproc: random_truncate_txt + seed: *seed +eval: + eval_pos: [pretrain, fake_quant] + name: wikitext2 + download: False + path: eval data path + bs: 1 + inference_per_block: False + # For 70B model eval, bs can be set to 20, and inference_per_block can be set to True. + # For 7B / 13B model eval, bs can be set to 1, and inference_per_block can be set to False. + seq_len: 2048 +quant: + method: Awq + weight: + bit: 4 + symmetric: False + granularity: per_group + group_size: 128 +save: + save_trans: False + save_path: ./save diff --git a/configs/quantization/Awq/awq_w4a16_fakequant_trt-llm.yml b/configs/quantization/Awq/awq_w4a16_fakequant_trt-llm.yml new file mode 100644 index 000000000..c793d25f1 --- /dev/null +++ b/configs/quantization/Awq/awq_w4a16_fakequant_trt-llm.yml @@ -0,0 +1,36 @@ +base: + seed: &seed 42 +model: + type: Llama + path: model path + torch_dtype: auto +calib: + name: pileval + download: False + path: calib data path + n_samples: 128 + bs: -1 + seq_len: 512 + preproc: pileval_awq + seed: *seed +eval: + eval_pos: [] + name: wikitext2 + download: False + path: eval data path + bs: 1 + seq_len: 2048 +quant: + method: Awq + weight: + bit: 4 + symmetric: True + granularity: per_group + group_size: 128 +save: + save_trans: False + save_trtllm: True + trtllm_cfg: + tp_size: 1 + pp_size: 1 + save_path: ./save diff --git a/configs/quantization/Awq/awq_w4a4_best.yml b/configs/quantization/Awq/awq_w4a4_best.yml new file mode 100644 index 000000000..ddf1850b7 --- /dev/null +++ b/configs/quantization/Awq/awq_w4a4_best.yml @@ -0,0 +1,52 @@ +base: + seed: &seed 42 +model: + type: Llama + path: model path + tokenizer_mode: slow + torch_dtype: auto +calib: + name: pileval + download: False + path: calib data path + n_samples: 128 + bs: -1 + seq_len: 512 + preproc: general + seed: *seed +eval: + eval_pos: [pretrain, transformed, fake_quant] + name: wikitext2 + download: False + path: eval data path + bs: 1 + inference_per_block: False + # For 70B model eval, bs can be set to 20, and inference_per_block can be set to True. + # For 7B / 13B model eval, bs can be set to 1, and inference_per_block can be set to False. 
+ seq_len: 2048 +quant: + method: Awq + weight: + bit: 4 + symmetric: False + granularity: per_channel + group_size: -1 + calib_algo: learnable + act: + bit: 4 + symmetric: False + granularity: per_token + calib_algo: minmax + special: + trans: True + trans_version: v2 + weight_clip: True + clip_version: v2 + save_scale: True + scale_path: scale path + save_clip: True + clip_path: clip path +save: + save_trans: False + save_quant: False + save_path: ./save diff --git a/configs/quantization/Awq/awq_w4a8_best.yml b/configs/quantization/Awq/awq_w4a8_best.yml new file mode 100644 index 000000000..bc9761c7c --- /dev/null +++ b/configs/quantization/Awq/awq_w4a8_best.yml @@ -0,0 +1,52 @@ +base: + seed: &seed 42 +model: + type: Llama + path: model path + tokenizer_mode: slow + torch_dtype: auto +calib: + name: pileval + download: False + path: calib data path + n_samples: 128 + bs: -1 + seq_len: 512 + preproc: general + seed: *seed +eval: + eval_pos: [pretrain, transformed, fake_quant] + name: wikitext2 + download: False + path: eval data path + bs: 1 + inference_per_block: False + # For 70B model eval, bs can be set to 20, and inference_per_block can be set to True. + # For 7B / 13B model eval, bs can be set to 1, and inference_per_block can be set to False. + seq_len: 2048 +quant: + method: Awq + weight: + bit: 4 + symmetric: False + granularity: per_channel + group_size: -1 + calib_algo: learnable + act: + bit: 8 + symmetric: False + granularity: per_token + calib_algo: minmax + special: + trans: True + trans_version: v2 + weight_clip: True + clip_version: v2 + save_scale: True + scale_path: scale path + save_clip: True + clip_path: clip path +save: + save_trans: False + save_quant: False + save_path: ./save diff --git a/configs/quantization/Awq/awq_w8a8_fakequant_eval_general.yml b/configs/quantization/Awq/awq_w8a8_fakequant_eval_general.yml new file mode 100644 index 000000000..c5e449208 --- /dev/null +++ b/configs/quantization/Awq/awq_w8a8_fakequant_eval_general.yml @@ -0,0 +1,35 @@ +base: + seed: &seed 42 +model: + type: Llama + path: model path + torch_dtype: auto +calib: + name: pileval + download: False + path: calib data path + n_samples: 128 + bs: -1 + seq_len: 512 + preproc: pileval_awq + seed: *seed +eval: + eval_pos: [pretrain, fake_quant] + name: wikitext2 + download: False + path: eval data path + bs: 1 + seq_len: 2048 +quant: + method: Awq + weight: + bit: 8 + symmetric: True + granularity: per_channel + act: + bit: 8 + symmetric: True + granularity: per_token +save: + save_trans: False + save_path: ./save diff --git a/configs/quantization/Awq/awq_w_only_mix_bits_1.yml b/configs/quantization/Awq/awq_w_only_mix_bits_1.yml new file mode 100644 index 000000000..da018ee82 --- /dev/null +++ b/configs/quantization/Awq/awq_w_only_mix_bits_1.yml @@ -0,0 +1,46 @@ +base: + seed: &seed 42 +model: + type: Llama + path: model path + tokenizer_mode: slow + torch_dtype: auto +calib: + name: pileval + download: False + path: calib data path + n_samples: 128 + bs: -1 + seq_len: 512 + preproc: general + seed: *seed +eval: + eval_pos: [pretrain, fake_quant] + name: wikitext2 + download: False + path: eval data path + bs: 1 + inference_per_block: False + # For 70B model eval, bs can be set to 20, and inference_per_block can be set to True. + # For 7B / 13B model eval, bs can be set to 1, and inference_per_block can be set to False. 
+ seq_len: 2048 +quant: + method: Awq + weight: + bit: 4 + symmetric: False + granularity: per_group + group_size: 128 + mix_bits: + setting_0: + layer_name: [down_proj] + do_quant: True + weight: + bit: 8 + symmetric: False + granularity: per_group + group_size: 128 +save: + save_trans: False + save_path: ./save + tokenizer_file_substring: ["token"] diff --git a/configs/quantization/Awq/awq_w_only_mix_bits_2.yml b/configs/quantization/Awq/awq_w_only_mix_bits_2.yml new file mode 100644 index 000000000..237ea6410 --- /dev/null +++ b/configs/quantization/Awq/awq_w_only_mix_bits_2.yml @@ -0,0 +1,49 @@ +base: + seed: &seed 42 +model: + type: Llama + path: model path + tokenizer_mode: slow + torch_dtype: auto +calib: + name: pileval + download: False + path: calib data path + n_samples: 128 + bs: -1 + seq_len: 512 + preproc: general + seed: *seed +eval: + eval_pos: [pretrain, fake_quant] + name: wikitext2 + download: False + path: eval data path + bs: 1 + inference_per_block: False + # For 70B model eval, bs can be set to 20, and inference_per_block can be set to True. + # For 7B / 13B model eval, bs can be set to 1, and inference_per_block can be set to False. + seq_len: 2048 +quant: + method: Awq + weight: + bit: 4 + symmetric: False + granularity: per_group + group_size: 128 + mix_bits: + setting_0: + layer_name: [down_proj#0-1-2-3-28-29-30-31] + do_quant: True + weight: + bit: 8 + symmetric: False + granularity: per_group + group_size: 128 + setting_1: + layer_name: [o_proj] + do_quant: False +save: + save_trans: False + save_path: ./save + tokenizer_file_substring: ["token"] diff --git a/configs/quantization/Awq/awq_wa_mix_bits.yml b/configs/quantization/Awq/awq_wa_mix_bits.yml new file mode 100644 index 000000000..0b0fd120e --- /dev/null +++ b/configs/quantization/Awq/awq_wa_mix_bits.yml @@ -0,0 +1,47 @@ +base: + seed: &seed 42 +model: + type: Llama + path: model path + torch_dtype: auto +calib: + name: pileval + download: False + path: calib data path + n_samples: 128 + bs: -1 + seq_len: 512 + preproc: pileval_awq + seed: *seed +eval: + eval_pos: [pretrain, fake_quant] + name: wikitext2 + download: False + path: eval data path + bs: 1 + seq_len: 2048 +quant: + method: Awq + weight: + bit: 4 + symmetric: False + granularity: per_channel + act: + bit: 4 + symmetric: False + granularity: per_token + mix_bits: + setting_0: + layer_name: [down_proj] + do_quant: True + weight: + bit: 8 + symmetric: False + granularity: per_channel + act: + bit: 8 + symmetric: False + granularity: per_token +save: + save_trans: False + save_path: ./save diff --git a/configs/quantization/DGQ/dgq_w4a8_fakequant_eval.yml b/configs/quantization/DGQ/dgq_w4a8_fakequant_eval.yml new file mode 100644 index 000000000..c0bd8af33 --- /dev/null +++ b/configs/quantization/DGQ/dgq_w4a8_fakequant_eval.yml @@ -0,0 +1,41 @@ +base: + seed: &seed 0 +model: + type: Llama + path: model path + torch_dtype: auto +calib: + name: wikitext2 + download: False + path: calib data path + n_samples: 1 + bs: 1 + seq_len: 2048 + preproc: wikitext2_gptq + seed: *seed +eval: + eval_pos: [] + name: wikitext2 + download: False + path: eval data path + bs: 1 + seq_len: 2048 +quant: + method: DGQ + weight: + w_1: + bit: 4 + symmetric: False + granularity: per_group + group_size: 128 + w_2: + bit: 8 + symmetric: True + granularity: per_channel + act: + bit: 8 + symmetric: True + granularity: per_token +save: + save_trans: False + save_path: ./save diff --git a/configs/quantization/FP/awq_we2m1a16_128_fakequant_eval.yml 
b/configs/quantization/FP/awq_we2m1a16_128_fakequant_eval.yml new file mode 100644 index 000000000..6ee067efa --- /dev/null +++ b/configs/quantization/FP/awq_we2m1a16_128_fakequant_eval.yml @@ -0,0 +1,33 @@ +base: + seed: &seed 42 +model: + type: Llama + path: model path + tokenizer_mode: slow + torch_dtype: auto +calib: + name: pileval + download: False + path: calib data path + n_samples: 128 + bs: -1 + seq_len: 512 + preproc: pileval_awq + seed: *seed +eval: + eval_pos: [pretrain, fake_quant] + name: wikitext2 + download: False + path: eval data path + bs: 1 + seq_len: 2048 +quant: + method: Awq + weight: + bit: 4 + symmetric: True + granularity: per_group + group_size: 128 +save: + save_trans: False + save_path: ./save diff --git a/configs/quantization/FP/rtn_w4a16_fakequant_eval.yml b/configs/quantization/FP/rtn_w4a16_fakequant_eval.yml new file mode 100644 index 000000000..fc8663f3a --- /dev/null +++ b/configs/quantization/FP/rtn_w4a16_fakequant_eval.yml @@ -0,0 +1,24 @@ +base: + seed: &seed 42 +model: + type: Llama + path: model path + tokenizer_mode: slow + torch_dtype: auto +eval: + eval_pos: [pretrain, fake_quant] + name: wikitext2 + download: False + path: eval data path + bs: 1 + seq_len: 2048 +quant: + method: RTN + weight: + bit: 4 + symmetric: True + granularity: per_group + group_size: 128 +save: + save_trans: False + save_path: ./save diff --git a/configs/quantization/FP/rtn_w8a8_fakequant_eval.yml b/configs/quantization/FP/rtn_w8a8_fakequant_eval.yml new file mode 100644 index 000000000..cff96da07 --- /dev/null +++ b/configs/quantization/FP/rtn_w8a8_fakequant_eval.yml @@ -0,0 +1,27 @@ +base: + seed: &seed 42 +model: + type: Llama + path: model path + tokenizer_mode: slow + torch_dtype: auto +eval: + eval_pos: [pretrain, fake_quant] + name: wikitext2 + download: False + path: eval data path + bs: 1 + seq_len: 2048 +quant: + method: RTN + weight: + bit: 8 + symmetric: True + granularity: per_channel + act: + bit: 8 + symmetric: True + granularity: per_token +save: + save_trans: False + save_path: ./save diff --git a/configs/quantization/FP/rtn_we2m1a16_fakequant_eval.yml b/configs/quantization/FP/rtn_we2m1a16_fakequant_eval.yml new file mode 100644 index 000000000..fcf0451e0 --- /dev/null +++ b/configs/quantization/FP/rtn_we2m1a16_fakequant_eval.yml @@ -0,0 +1,23 @@ +base: + seed: &seed 42 +model: + type: Llama + path: model path + tokenizer_mode: slow + torch_dtype: auto +eval: + eval_pos: [pretrain, fake_quant] + name: wikitext2 + download: False + path: eval data path + bs: 1 + seq_len: 2048 +quant: + method: RTN + weight: + bit: e2m1 + symmetric: True + granularity: per_channel +save: + save_trans: False + save_path: ./save diff --git a/configs/quantization/FP/rtn_we2m1a16_fakequant_g128_eval.yml b/configs/quantization/FP/rtn_we2m1a16_fakequant_g128_eval.yml new file mode 100644 index 000000000..493a124f2 --- /dev/null +++ b/configs/quantization/FP/rtn_we2m1a16_fakequant_g128_eval.yml @@ -0,0 +1,24 @@ +base: + seed: &seed 42 +model: + type: Llama + path: model path + tokenizer_mode: slow + torch_dtype: auto +eval: + eval_pos: [pretrain, fake_quant] + name: wikitext2 + download: False + path: eval data path + bs: 1 + seq_len: 2048 +quant: + method: RTN + weight: + bit: e2m1 + symmetric: True + granularity: per_group + group_size: 128 +save: + save_trans: False + save_path: ./save diff --git a/configs/quantization/FP/rtn_we2m1ae2m1_fakequant_eval.yml b/configs/quantization/FP/rtn_we2m1ae2m1_fakequant_eval.yml new file mode 100644 index 000000000..2938cd11a --- /dev/null 
+++ b/configs/quantization/FP/rtn_we2m1ae2m1_fakequant_eval.yml @@ -0,0 +1,27 @@ +base: + seed: &seed 42 +model: + type: Llama + path: model path + tokenizer_mode: slow + torch_dtype: auto +eval: + eval_pos: [pretrain, fake_quant] + name: wikitext2 + download: False + path: eval data path + bs: 1 + seq_len: 2048 +quant: + method: RTN + weight: + bit: e2m1 + symmetric: True + granularity: per_channel + act: + bit: e2m1 + symmetric: True + granularity: per_token +save: + save_trans: False + save_path: ./save diff --git a/configs/quantization/FP/rtn_we4m3ae4m3_fakequant_eval.yml b/configs/quantization/FP/rtn_we4m3ae4m3_fakequant_eval.yml new file mode 100644 index 000000000..ad8e1b935 --- /dev/null +++ b/configs/quantization/FP/rtn_we4m3ae4m3_fakequant_eval.yml @@ -0,0 +1,27 @@ +base: + seed: &seed 42 +model: + type: Llama + path: model path + tokenizer_mode: slow + torch_dtype: auto +eval: + eval_pos: [pretrain, fake_quant] + name: wikitext2 + download: False + path: eval data path + bs: 1 + seq_len: 2048 +quant: + method: RTN + weight: + bit: e4m3 + symmetric: True + granularity: per_channel + act: + bit: e4m3 + symmetric: True + granularity: per_token +save: + save_trans: False + save_path: ./save diff --git a/configs/quantization/FP/rtn_we5m2ae5m2_fakequant_eval.yml b/configs/quantization/FP/rtn_we5m2ae5m2_fakequant_eval.yml new file mode 100644 index 000000000..8e8f3cae0 --- /dev/null +++ b/configs/quantization/FP/rtn_we5m2ae5m2_fakequant_eval.yml @@ -0,0 +1,27 @@ +base: + seed: &seed 42 +model: + type: Llama + path: model path + tokenizer_mode: slow + torch_dtype: auto +eval: + eval_pos: [pretrain, fake_quant] + name: wikitext2 + download: False + path: eval data path + bs: 1 + seq_len: 2048 +quant: + method: RTN + weight: + bit: e5m2 + symmetric: True + granularity: per_channel + act: + bit: e5m2 + symmetric: True + granularity: per_token +save: + save_trans: False + save_path: ./save diff --git a/configs/quantization/GPTQ/gptq_owq_w4a16_fakequant_eval.yml b/configs/quantization/GPTQ/gptq_owq_w4a16_fakequant_eval.yml new file mode 100644 index 000000000..3abfb0b98 --- /dev/null +++ b/configs/quantization/GPTQ/gptq_owq_w4a16_fakequant_eval.yml @@ -0,0 +1,41 @@ +base: + seed: &seed 0 +model: + type: Llama + path: model path + torch_dtype: auto +calib: + name: c4 + download: False + n_samples: 128 + path: calib data path + bs: 1 + seq_len: 2048 + preproc: c4_gptq + seed: *seed +eval: + eval_pos: [fake_quant] + name: wikitext2 + download: False + path: eval data path + bs: 1 + seq_len: 2048 +quant: + method: GPTQ + weight: + bit: 4 + symmetric: False + granularity: per_group + group_size: 128 + special: + actorder: False + static_groups: False + percdamp: 0.01 + blocksize: 128 + true_sequential: True + owq: True + n_outs: [6, 6, 6, 6, 2, 2, 6] #target bit is 4.01 + quant_out: True +save: + save_trans: False + save_path: ./save diff --git a/configs/quantization/GPTQ/gptq_quarot.yml b/configs/quantization/GPTQ/gptq_quarot.yml new file mode 100644 index 000000000..c753eea4a --- /dev/null +++ b/configs/quantization/GPTQ/gptq_quarot.yml @@ -0,0 +1,51 @@ +base: + seed: &seed 42 +model: + type: Llama + path: model path + torch_dtype: auto +calib: + name: pileval + download: False + n_samples: 128 + path: valib data path + bs: 1 + seq_len: 512 + preproc: general + seed: *seed +eval: + eval_pos: [fake_quant] + name: [wikitext2, c4] + download: False + path: eval data path + bs: 1 + inference_per_block: False + seq_len: 2048 +quant: + method: GPTQ + weight: + bit: 6 + symmetric: False + granularity: 
per_channel + group_size: -1 + qmax_to_tensor: True + calib_algo: minmax + act: + bit: 6 + symmetric: False + granularity: per_token + qmax_to_tensor: True + calib_algo: minmax + special: + actorder: True + static_groups: True + percdamp: 0.01 + blocksize: 128 + true_sequential: True + online_rotate: True + fp32_had: True + quant_out: True +save: + save_trans: False + save_fake: False + save_path: ./save diff --git a/configs/quantization/GPTQ/gptq_w4a16_fakequant_eval.yml b/configs/quantization/GPTQ/gptq_w4a16_fakequant_eval.yml new file mode 100644 index 000000000..dc5eadef8 --- /dev/null +++ b/configs/quantization/GPTQ/gptq_w4a16_fakequant_eval.yml @@ -0,0 +1,39 @@ +base: + seed: &seed 0 +model: + type: Llama + path: model path + torch_dtype: auto +calib: + name: c4 + download: False + n_samples: 128 + path: calib data path + bs: 1 + seq_len: 2048 + preproc: c4_gptq + seed: *seed +eval: + eval_pos: [pretrain, fake_quant] + name: wikitext2 + download: False + path: eval data path + bs: 1 + seq_len: 2048 +quant: + method: GPTQ + weight: + bit: 4 + symmetric: False + granularity: per_group + group_size: 128 + special: + actorder: True + static_groups: False + percdamp: 0.01 + blocksize: 128 + true_sequential: True + quant_out: True +save: + save_trans: False + save_path: ./save diff --git a/configs/quantization/GPTQ/gptq_w4a16_fakequant_eval_general.yml b/configs/quantization/GPTQ/gptq_w4a16_fakequant_eval_general.yml new file mode 100644 index 000000000..ea4abee7f --- /dev/null +++ b/configs/quantization/GPTQ/gptq_w4a16_fakequant_eval_general.yml @@ -0,0 +1,39 @@ +base: + seed: &seed 42 +model: + type: Llama + path: model path + torch_dtype: auto +calib: + name: pileval + download: False + path: calib data path + n_samples: 128 + bs: 1 + seq_len: 512 + preproc: general + seed: *seed +eval: + eval_pos: [pretrain, fake_quant] + name: wikitext2 + download: False + path: eval data path + bs: 1 + seq_len: 2048 +quant: + method: GPTQ + weight: + bit: 4 + symmetric: False + granularity: per_group + group_size: 128 + special: + actorder: True + static_groups: True + percdamp: 0.01 + blocksize: 128 + true_sequential: True + quant_out: True +save: + save_trans: False + save_path: ./save diff --git a/configs/quantization/HQQ/hqq_w4a16_fakequant_eval.yml b/configs/quantization/HQQ/hqq_w4a16_fakequant_eval.yml new file mode 100644 index 000000000..5083eb983 --- /dev/null +++ b/configs/quantization/HQQ/hqq_w4a16_fakequant_eval.yml @@ -0,0 +1,30 @@ +base: + seed: &seed 42 +model: + type: Llama + path: model path + torch_dtype: auto +eval: + eval_pos: [pretrain, fake_quant] + name: wikitext2 + download: False + path: eval data path + bs: 1 + seq_len: 2048 +quant: + method: HQQ + weight: + bit: 4 + symmetric: False + granularity: per_group + group_size: 128 + round_zp: False + special: + axis : 0 + lp_norm : 0.7 + beta : 10 + kappa : 1.01 + iters : 20 +save: + save_trans: False + save_path: ./save diff --git a/configs/quantization/LlmInt8/llmint8_w8a8_fakequant_eval.yml b/configs/quantization/LlmInt8/llmint8_w8a8_fakequant_eval.yml new file mode 100644 index 000000000..d2a5d66d8 --- /dev/null +++ b/configs/quantization/LlmInt8/llmint8_w8a8_fakequant_eval.yml @@ -0,0 +1,38 @@ +base: + seed: &seed 42 +model: + type: Llama + path: model path + torch_dtype: auto +calib: + name: pileval + download: False + path: calib data path + n_samples: 128 + bs: 1 + seq_len: 2048 + preproc: general + seed: *seed +eval: + eval_pos: [pretrain, fake_quant] + name: wikitext2 + download: False + path: eval data path + bs: 1 + 
seq_len: 2048 +quant: + method: LlmInt8 + weight: + bit: 8 + symmetric: True + granularity: per_channel + group_size: -1 + act: + bit: 8 + symmetric: True + granularity: per_token + special: + threshold: 6.0 +save: + save_trans: False + save_path: ./save diff --git a/configs/quantization/NormTweaking/ntweak_llama_w4a16_fakequant_eval.yml b/configs/quantization/NormTweaking/ntweak_llama_w4a16_fakequant_eval.yml new file mode 100644 index 000000000..e3a211505 --- /dev/null +++ b/configs/quantization/NormTweaking/ntweak_llama_w4a16_fakequant_eval.yml @@ -0,0 +1,38 @@ +base: + seed: &seed 42 +model: + type: Llama + path: model path + torch_dtype: auto +calib: + name: pileval + download: False + path: calib data path + n_samples: 128 + bs: 1 + seq_len: 512 + preproc: general + seed: *seed +eval: + eval_pos: [pretrain, fake_quant] + name: wikitext2 + download: False + path: eval data path + bs: 1 + seq_len: 2048 +quant: + method: NormTweaking + weight: + bit: 4 + symmetric: False + granularity: per_group + group_size: 128 + special: + ntweak_lr: 0.000001 + deactive_amp: False + epochs: 50 + gamma: 0.001 + quant_out: True +save: + save_trans: False + save_path: ./save diff --git a/configs/quantization/NormTweaking/ntweak_llama_w8a8_fakequant_eval.yml b/configs/quantization/NormTweaking/ntweak_llama_w8a8_fakequant_eval.yml new file mode 100644 index 000000000..9624d52ce --- /dev/null +++ b/configs/quantization/NormTweaking/ntweak_llama_w8a8_fakequant_eval.yml @@ -0,0 +1,42 @@ +base: + seed: &seed 42 +model: + type: Llama + path: model path + torch_dtype: auto +calib: + name: pileval + download: False + path: calib data path + n_samples: 128 + bs: 1 + seq_len: 512 + preproc: general + seed: *seed +eval: + eval_pos: [pretrain, fake_quant] + name: wikitext2 + download: False + path: eval data path + bs: 1 + seq_len: 2048 +quant: + method: NormTweaking + weight: + bit: 8 + symmetric: True + granularity: per_channel + group_size: -1 + act: + bit: 8 + symmetric: True + granularity: per_token + special: + ntweak_lr: 0.000001 + deactive_amp: True + epochs: 50 + gamma: 0.001 + quant_out: True +save: + save_trans: False + save_path: ./save diff --git a/configs/quantization/OmniQuant/omniq_llama_w2a16_best.yml b/configs/quantization/OmniQuant/omniq_llama_w2a16_best.yml new file mode 100644 index 000000000..511aaf734 --- /dev/null +++ b/configs/quantization/OmniQuant/omniq_llama_w2a16_best.yml @@ -0,0 +1,51 @@ +base: + seed: &seed 42 +model: + type: Llama + path: transformed model path + torch_dtype: auto +calib: + name: pileval + download: False + path: calib data path + n_samples: 128 + bs: 1 + seq_len: 2048 + preproc: general + seed: *seed +eval: + eval_pos: [fake_quant] + name: [wikitext2, c4, ptb] + download: False + path: eval data path + bs: 1 + inference_per_block: False + # For 70B model eval, bs can be set to 20, and inference_per_block can be set to True. + # For 7B / 13B model eval, bs can be set to 1, and inference_per_block can be set to False. 
+ seq_len: 2048 +quant: + method: OmniQuant + weight: + bit: 2 + symmetric: False + granularity: per_group + group_size: 64 + calib_algo: learnable + ste: True + special: + aug_loss: True + lwc: True + let: False + lwc_lr: 0.01 + let_lr: 0.005 + use_shift: False + alpha: 0.5 + deactive_amp: True + epochs: 5 + wd: 0 + search_clip_init: True + search_scale_init: True + quant_out: True +save: + save_trans: False + save_path: ./save diff --git a/configs/quantization/OmniQuant/omniq_llama_w2a16_fakequant_eval.yml b/configs/quantization/OmniQuant/omniq_llama_w2a16_fakequant_eval.yml new file mode 100644 index 000000000..9840f97df --- /dev/null +++ b/configs/quantization/OmniQuant/omniq_llama_w2a16_fakequant_eval.yml @@ -0,0 +1,49 @@ +base: + seed: &seed 42 +model: + type: Llama + path: model path + torch_dtype: auto +calib: + name: pileval + download: False + path: calib data path + n_samples: 128 + bs: 1 + seq_len: 512 + preproc: general + seed: *seed +eval: + eval_pos: [fake_quant] + name: [wikitext2, c4, ptb] + download: False + path: eval data path + bs: 1 + inference_per_block: False + # For 70B model eval, bs can be set to 20, and inference_per_block can be set to True. + # For 7B / 13B model eval, bs can be set to 1, and inference_per_block can be set to False. + seq_len: 2048 +quant: + method: OmniQuant + weight: + bit: 2 + symmetric: False + granularity: per_group + group_size: 64 + calib_algo: learnable + ste: True + special: + aug_loss: True + lwc: True + let: False + lwc_lr: 0.01 + let_lr: 0.005 + use_shift: False + alpha: 0.5 + deactive_amp: True + epochs: 40 + wd: 0 + quant_out: True +save: + save_trans: False + save_path: ./save diff --git a/configs/quantization/OmniQuant/omniq_llama_w4a16_fakequant_eval.yml b/configs/quantization/OmniQuant/omniq_llama_w4a16_fakequant_eval.yml new file mode 100644 index 000000000..bbd03e56f --- /dev/null +++ b/configs/quantization/OmniQuant/omniq_llama_w4a16_fakequant_eval.yml @@ -0,0 +1,49 @@ +base: + seed: &seed 42 +model: + type: Llama + path: model path + torch_dtype: auto +calib: + name: pileval + download: False + path: calib data path + n_samples: 128 + bs: 1 + seq_len: 512 + preproc: general + seed: *seed +eval: + eval_pos: [fake_quant] + name: [wikitext2, c4, ptb] + download: False + path: eval data path + bs: 1 + inference_per_block: False + # For 70B model eval, bs can be set to 20, and inference_per_block can be set to True. + # For 7B / 13B model eval, bs can be set to 1, and inference_per_block can be set to False. 
+ seq_len: 2048 +quant: + method: OmniQuant + weight: + bit: 4 + symmetric: False + granularity: per_group + group_size: 128 + calib_algo: learnable + ste: True + special: + aug_loss: False + lwc: True + let: False + lwc_lr: 0.01 + let_lr: 0.005 + use_shift: False + alpha: 0.5 + deactive_amp: True + epochs: 20 + wd: 0 + quant_out: True +save: + save_trans: False + save_path: ./save diff --git a/configs/quantization/OmniQuant/omniq_llama_w4a4_best.yml b/configs/quantization/OmniQuant/omniq_llama_w4a4_best.yml new file mode 100644 index 000000000..9a8bae2f2 --- /dev/null +++ b/configs/quantization/OmniQuant/omniq_llama_w4a4_best.yml @@ -0,0 +1,59 @@ +base: + seed: &seed 42 +model: + type: Llama + path: transformed model path + torch_dtype: auto +calib: + name: pileval + download: False + path: calib data path + n_samples: 128 + bs: 1 + seq_len: 2048 + preproc: general + seed: *seed +eval: + eval_pos: [fake_quant] + name: [wikitext2, c4, ptb] + download: False + path: eval data path + bs: 1 + inference_per_block: False + # For 70B model eval, bs can be set to 20, and inference_per_block can be set to True. + # For 7B / 13B model eval, bs can be set to 1, and inference_per_block can be set to False. + seq_len: 2048 +quant: + method: OmniQuant + weight: + bit: 4 + symmetric: False + granularity: per_channel + calib_algo: learnable + ste: True + act: + bit: 4 + symmetric: False + granularity: per_token + ste: True + special: + aug_loss: False + lwc: True + let: True + lwc_lr: 0.001 + let_lr: 0.001 + use_shift: False + alpha: 0.5 + deactive_amp: True + epochs: 5 + wd: 0 + search_clip_init: True + load_clip: True + search_scale_init: True + scale_path: scale path + clip_path: clip path + quant_out: True +save: + save_trans: False + save_fake: False + save_path: ./save diff --git a/configs/quantization/OmniQuant/omniq_llama_w4a8_best.yml b/configs/quantization/OmniQuant/omniq_llama_w4a8_best.yml new file mode 100644 index 000000000..267e509cb --- /dev/null +++ b/configs/quantization/OmniQuant/omniq_llama_w4a8_best.yml @@ -0,0 +1,59 @@ +base: + seed: &seed 42 +model: + type: Llama + path: model path + torch_dtype: auto +calib: + name: pileval + download: False + path: calib data path + n_samples: 128 + bs: 1 + seq_len: 2048 + preproc: general + seed: *seed +eval: + eval_pos: [fake_quant] + name: [wikitext2, c4, ptb] + download: False + path: eval data path + bs: 1 + inference_per_block: False + # For 70B model eval, bs can be set to 20, and inference_per_block can be set to True. + # For 7B / 13B model eval, bs can be set to 1, and inference_per_block can be set to False. 
+ seq_len: 2048 +quant: + method: OmniQuant + weight: + bit: 4 + symmetric: False + granularity: per_channel + calib_algo: learnable + ste: True + act: + bit: 8 + symmetric: False + granularity: per_token + ste: True + special: + aug_loss: False + lwc: True + let: True + lwc_lr: 0.001 + let_lr: 0.001 + use_shift: False + alpha: 0.5 + deactive_amp: True + epochs: 5 + wd: 0 + search_clip_init: True + load_clip: True + search_scale_init: True + scale_path: scale path + clip_path: clip path + quant_out: True +save: + save_trans: False + save_fake: False + save_path: ./save diff --git a/configs/quantization/OmniQuant/omniq_llama_w8a8_fakequant_eval.yml b/configs/quantization/OmniQuant/omniq_llama_w8a8_fakequant_eval.yml new file mode 100644 index 000000000..5912c1c6d --- /dev/null +++ b/configs/quantization/OmniQuant/omniq_llama_w8a8_fakequant_eval.yml @@ -0,0 +1,51 @@ +base: + seed: &seed 42 +model: + type: Llama + path: model path + torch_dtype: auto +calib: + name: pileval + download: False + path: calib data path + n_samples: 128 + bs: 1 + seq_len: 2048 + preproc: general + seed: *seed +eval: + eval_pos: [pretrain, fake_quant] + name: wikitext2 + download: False + path: eval data path + bs: 1 + seq_len: 2048 +quant: + method: OmniQuant + weight: + bit: 8 + symmetric: True + granularity: per_channel + group_size: -1 + calib_algo: learnable + ste: True + act: + bit: 8 + symmetric: True + granularity: per_token + ste: True + special: + aug_loss: False + let: True + lwc: True + lwc_lr: 0.01 + let_lr: 0.005 + use_shift: False + alpha: 0.5 + deactive_amp: True + epochs: 20 + wd: 0 + quant_out: True +save: + save_trans: False + save_path: ./save diff --git a/configs/quantization/OmniQuant/omniq_mistral_w8a8_fakequant_eval.yml b/configs/quantization/OmniQuant/omniq_mistral_w8a8_fakequant_eval.yml new file mode 100644 index 000000000..64a35e770 --- /dev/null +++ b/configs/quantization/OmniQuant/omniq_mistral_w8a8_fakequant_eval.yml @@ -0,0 +1,49 @@ +base: + seed: &seed 42 +model: + type: Mistral + path: models/mistral/ + torch_dtype: auto +calib: + name: pileval + download: False + path: llmc/cali_data/pileval/ + n_samples: 128 + bs: 1 + seq_len: 2048 + preproc: general + seed: *seed +eval: + eval_pos: [fake_quant] + name: [wikitext2, c4, ptb] + download: False + path: llmc/eval_data/ + bs: 1 + seq_len: 2048 +quant: + method: OmniQuant + weight: + bit: 8 + symmetric: True + granularity: per_channel + group_size: -1 + calib_algo: learnable + ste: True + act: + bit: 8 + symmetric: True + granularity: per_token + ste: True + special: + let: True + lwc_lr: 0.01 + let_lr: 0.005 + use_shift: False + alpha: 0.5 + deactive_amp: True + epochs: 20 + wd: 0 + quant_out: True +save: + save_trans: False + save_path: ./save diff --git a/configs/quantization/OmniQuant/omniq_opt_w8a8_fakequant_eval.yml b/configs/quantization/OmniQuant/omniq_opt_w8a8_fakequant_eval.yml new file mode 100644 index 000000000..e88cfea78 --- /dev/null +++ b/configs/quantization/OmniQuant/omniq_opt_w8a8_fakequant_eval.yml @@ -0,0 +1,49 @@ +base: + seed: &seed 42 +model: + type: Opt + path: model path + torch_dtype: auto +calib: + name: pileval + download: False + path: calib data path + n_samples: 128 + bs: 1 + seq_len: 2048 + preproc: general + seed: *seed +eval: + eval_pos: [pretrain, fake_quant] + name: wikitext2 + download: False + path: eval data path + bs: 1 + seq_len: 2048 +quant: + method: OmniQuant + weight: + bit: 8 + symmetric: True + granularity: per_channel + group_size: -1 + calib_algo: learnable + ste: True + act: + bit: 8 + 
symmetric: True + granularity: per_token + ste: True + special: + let: True + lwc_lr: 0.01 + let_lr: 0.005 + use_shift: True + alpha: 0.5 + deactive_amp: True + epochs: 20 + wd: 0 + quant_out: True +save: + save_trans: False + save_path: ./save diff --git a/configs/quantization/OsPlus/osplus_llama_w4a4_best.yml b/configs/quantization/OsPlus/osplus_llama_w4a4_best.yml new file mode 100644 index 000000000..12d4b66e3 --- /dev/null +++ b/configs/quantization/OsPlus/osplus_llama_w4a4_best.yml @@ -0,0 +1,46 @@ +base: + seed: &seed 42 +model: + type: Llama + path: model path + torch_dtype: auto +calib: + name: pileval + download: False + path: calib data path + n_samples: 1 + bs: 1 + seq_len: 512 + preproc: general + seed: *seed +eval: + eval_pos: [pretrain, transformed, fake_quant] + name: wikitext2 + download: False + path: eval data path + bs: 1 + seq_len: 2048 +quant: + method: OsPlus + weight: + bit: 4 + symmetric: False + granularity: per_channel + group_size: -1 + calib_algo: learnable + act: + bit: 4 + symmetric: False + granularity: per_token + special: + use_shift: False + weight_clip: True + save_scale: True + scale_path: scale path + save_clip: True + clip_path: clip path + clip_version: v2 + quant_out: True +save: + save_trans: False + save_path: ./save diff --git a/configs/quantization/OsPlus/osplus_llama_w4a8_best.yml b/configs/quantization/OsPlus/osplus_llama_w4a8_best.yml new file mode 100644 index 000000000..b5c3a6723 --- /dev/null +++ b/configs/quantization/OsPlus/osplus_llama_w4a8_best.yml @@ -0,0 +1,46 @@ +base: + seed: &seed 42 +model: + type: Llama + path: model path + torch_dtype: auto +calib: + name: pileval + download: False + path: calib data path + n_samples: 1 + bs: 1 + seq_len: 512 + preproc: general + seed: *seed +eval: + eval_pos: [pretrain, transformed, fake_quant] + name: wikitext2 + download: False + path: eval data path + bs: 1 + seq_len: 2048 +quant: + method: OsPlus + weight: + bit: 4 + symmetric: False + granularity: per_channel + group_size: -1 + calib_algo: learnable + act: + bit: 8 + symmetric: False + granularity: per_token + special: + use_shift: False + weight_clip: True + save_scale: True + scale_path: scale path + save_clip: True + clip_path: clip path + clip_version: v2 + quant_out: True +save: + save_trans: False + save_path: ./save diff --git a/configs/quantization/OsPlus/osplus_llama_w8a8_fakequant_eval_general.yml b/configs/quantization/OsPlus/osplus_llama_w8a8_fakequant_eval_general.yml new file mode 100644 index 000000000..4cd1dda47 --- /dev/null +++ b/configs/quantization/OsPlus/osplus_llama_w8a8_fakequant_eval_general.yml @@ -0,0 +1,36 @@ +base: + seed: &seed 42 +model: + type: Llama + path: model path + torch_dtype: auto +calib: + name: pileval + download: False + path: calib data path + n_samples: 1 + bs: 1 + seq_len: 512 + preproc: general + seed: *seed +eval: + eval_pos: [pretrain, transformed, fake_quant] + name: wikitext2 + download: False + path: eval data path + bs: 1 + seq_len: 2048 +quant: + method: OsPlus + weight: + bit: 8 + symmetric: True + granularity: per_channel + act: + bit: 8 + symmetric: True + granularity: per_token + quant_out: True +save: + save_trans: False + save_path: ./save diff --git a/configs/quantization/OsPlus/osplus_opt_w8a8_fakequant_eval_general.yml b/configs/quantization/OsPlus/osplus_opt_w8a8_fakequant_eval_general.yml new file mode 100644 index 000000000..7514785a7 --- /dev/null +++ b/configs/quantization/OsPlus/osplus_opt_w8a8_fakequant_eval_general.yml @@ -0,0 +1,36 @@ +base: + seed: &seed 42 +model: + 
type: Opt + path: model path + torch_dtype: auto +calib: + name: pileval + download: False + path: calib data path + n_samples: 1 + bs: 1 + seq_len: 512 + preproc: general + seed: *seed +eval: + eval_pos: [pretrain, transformed, fake_quant] + name: wikitext2 + download: False + path: eval data path + bs: 1 + seq_len: 2048 +quant: + method: OsPlus + weight: + bit: 8 + symmetric: False + granularity: per_channel + act: + bit: 8 + symmetric: False + granularity: per_token + quant_out: True +save: + save_trans: False + save_path: ./save diff --git a/configs/quantization/QUIK/quik_w4a4_fakequant_eval.yml b/configs/quantization/QUIK/quik_w4a4_fakequant_eval.yml new file mode 100644 index 000000000..31d9f36a2 --- /dev/null +++ b/configs/quantization/QUIK/quik_w4a4_fakequant_eval.yml @@ -0,0 +1,41 @@ +base: + seed: &seed 0 +model: + type: Llama + path: model path + torch_dtype: auto +calib: + name: c4 + download: False + path: calib data path + n_samples: 128 + bs: 1 + seq_len: 2048 + preproc: c4_gptq + seed: *seed +eval: + eval_pos: [pretrain, fake_quant] + name: wikitext2 + download: False + path: eval data path + bs: 1 + seq_len: 2048 +quant: + method: QUIK + weight: + bit: 8 + symmetric: True + granularity: per_channel + group_size: -1 + act: + bit: 8 + symmetric: False + granularity: per_token + special: + fp_relative: False + fp_features: 256 + fp_threshold: 0.0 + last_fc_bit: 8 +save: + save_trans: False + save_path: ./save diff --git a/configs/quantization/QuaRot/quarot_w4a4.yml b/configs/quantization/QuaRot/quarot_w4a4.yml new file mode 100644 index 000000000..0c037d762 --- /dev/null +++ b/configs/quantization/QuaRot/quarot_w4a4.yml @@ -0,0 +1,36 @@ +base: + seed: &seed 0 +model: + type: Llama + path: model_path + torch_dtype: auto +eval: + eval_pos: [transformed, fake_quant] + name: wikitext2 + download: False + path: eval data path + bs: 1 + inference_per_block: False + seq_len: 2048 +quant: + method: Quarot + weight: + bit: 4 + symmetric: False + granularity: per_channel + group_size: -1 + qmax_to_tensor: True + calib_algo: minmax + act: + bit: 4 + symmetric: False + granularity: per_token + qmax_to_tensor: True + special: + rotate_mode: hadamard + fp32_had: True + online_rotate: True +save: + save_trans: False + save_fake: False + save_path: ./save diff --git a/configs/quantization/RTN/rtn_w4a16.yml b/configs/quantization/RTN/rtn_w4a16.yml new file mode 100644 index 000000000..d9ef3b399 --- /dev/null +++ b/configs/quantization/RTN/rtn_w4a16.yml @@ -0,0 +1,16 @@ +base: + seed: &seed 42 +model: + type: Llama + path: model path + torch_dtype: auto +quant: + method: RTN + weight: + bit: 4 + symmetric: False + granularity: per_group + group_size: 128 +save: + save_trans: False + save_path: ./save_w4a16 diff --git a/configs/quantization/RTN/rtn_w4a16_fakequant_eval.yml b/configs/quantization/RTN/rtn_w4a16_fakequant_eval.yml new file mode 100644 index 000000000..ffc3e6875 --- /dev/null +++ b/configs/quantization/RTN/rtn_w4a16_fakequant_eval.yml @@ -0,0 +1,23 @@ +base: + seed: &seed 42 +model: + type: Llama + path: model path + torch_dtype: auto +eval: + eval_pos: [pretrain, fake_quant] + name: [wikitext2, c4, ptb] + download: False + path: eval data path + bs: 1 + seq_len: 2048 +quant: + method: RTN + weight: + bit: 4 + symmetric: False + granularity: per_group + group_size: 128 +save: + save_trans: False + save_path: ./save diff --git a/configs/quantization/RTN/rtn_w8a8.yml b/configs/quantization/RTN/rtn_w8a8.yml new file mode 100644 index 000000000..d0315d8b7 --- /dev/null +++ 
b/configs/quantization/RTN/rtn_w8a8.yml @@ -0,0 +1,20 @@ +base: + seed: &seed 42 +model: + type: Llama + path: model path + torch_dtype: auto +quant: + method: RTN + weight: + bit: 8 + symmetric: False + granularity: per_group + group_size: 128 + act: + bit: 8 + symmetric: True + granularity: per_token +save: + save_trans: False + save_path: ./save_w8a8 diff --git a/configs/quantization/RTN/rtn_w8a8_fakequant_eval.yml b/configs/quantization/RTN/rtn_w8a8_fakequant_eval.yml new file mode 100644 index 000000000..a516c2b8c --- /dev/null +++ b/configs/quantization/RTN/rtn_w8a8_fakequant_eval.yml @@ -0,0 +1,26 @@ +base: + seed: &seed 42 +model: + type: Llama + path: model path + torch_dtype: auto +eval: + eval_pos: [pretrain, fake_quant] + name: wikitext2 + download: False + path: eval data path + bs: 1 + seq_len: 2048 +quant: + method: RTN + weight: + bit: 8 + symmetric: True + granularity: per_channel + act: + bit: 8 + symmetric: True + granularity: per_token +save: + save_trans: False + save_path: ./save diff --git a/configs/quantization/RTN/rtn_w8a8_pertensor_static.yml b/configs/quantization/RTN/rtn_w8a8_pertensor_static.yml new file mode 100644 index 000000000..fe0e44365 --- /dev/null +++ b/configs/quantization/RTN/rtn_w8a8_pertensor_static.yml @@ -0,0 +1,36 @@ +base: + seed: &seed 42 +model: + type: Llama + path: model path + torch_dtype: auto +calib: + name: pileval + download: False + path: calib data path + n_samples: 128 + bs: 1 + seq_len: 512 + preproc: general + seed: *seed +eval: + eval_pos: [fake_quant] + name: wikitext2 + download: False + path: eval data path + bs: 1 + seq_len: 2048 +quant: + method: RTN + weight: + bit: 8 + symmetric: True + granularity: per_channel + act: + bit: 8 + symmetric: True + granularity: per_tensor + static: True +save: + save_trans: False + save_path: ./save diff --git a/configs/quantization/SmoothQuant/smoothquant_llama_w8a8_fakequant_eval.yml b/configs/quantization/SmoothQuant/smoothquant_llama_w8a8_fakequant_eval.yml new file mode 100644 index 000000000..24f961bb3 --- /dev/null +++ b/configs/quantization/SmoothQuant/smoothquant_llama_w8a8_fakequant_eval.yml @@ -0,0 +1,35 @@ +base: + seed: &seed 42 +model: + type: Llama + path: model path + torch_dtype: auto +calib: + name: pileval + download: False + path: calib data path + n_samples: 512 + bs: 1 + seq_len: 512 + preproc: pileval_smooth + seed: *seed +eval: + eval_pos: [pretrain, transformed, fake_quant] + name: wikitext2 + download: False + path: eval data path + bs: 1 + seq_len: 2048 +quant: + method: SmoothQuant + weight: + bit: 8 + symmetric: True + granularity: per_channel + act: + bit: 8 + symmetric: True + granularity: per_token +save: + save_trans: False + save_path: ./save diff --git a/configs/quantization/SmoothQuant/smoothquant_llama_w8a8_fakequant_eval_general.yml b/configs/quantization/SmoothQuant/smoothquant_llama_w8a8_fakequant_eval_general.yml new file mode 100644 index 000000000..98fd8937c --- /dev/null +++ b/configs/quantization/SmoothQuant/smoothquant_llama_w8a8_fakequant_eval_general.yml @@ -0,0 +1,35 @@ +base: + seed: &seed 42 +model: + type: Llama + path: model path + torch_dtype: auto +calib: + name: pileval + download: False + path: calib data path + n_samples: 512 + bs: 1 + seq_len: 512 + preproc: general + seed: *seed +eval: + eval_pos: [pretrain, transformed, fake_quant] + name: wikitext2 + download: False + path: eval data path + bs: 1 + seq_len: 2048 +quant: + method: SmoothQuant + weight: + bit: 8 + symmetric: True + granularity: per_channel + act: + bit: 8 + 
symmetric: True + granularity: per_token +save: + save_trans: False + save_path: ./save diff --git a/configs/quantization/SmoothQuant/smoothquant_llama_w8a8_trt-llm.yml b/configs/quantization/SmoothQuant/smoothquant_llama_w8a8_trt-llm.yml new file mode 100644 index 000000000..a6ff9c645 --- /dev/null +++ b/configs/quantization/SmoothQuant/smoothquant_llama_w8a8_trt-llm.yml @@ -0,0 +1,35 @@ +base: + seed: &seed 42 +model: + type: Llama + path: model path + torch_dtype: auto +calib: + name: pileval + download: False + path: calib data path + n_samples: 512 + bs: 1 + seq_len: 512 + preproc: pileval_smooth + seed: *seed +eval: + eval_pos: [] + name: wikitext2 + download: False + path: eval data path + bs: 1 + seq_len: 2048 +quant: + method: SmoothQuant + weight: + bit: 8 + symmetric: True + granularity: per_channel + act: + bit: 8 + symmetric: True + granularity: per_token +save: + save_trans: True + save_path: ./save diff --git a/configs/quantization/SmoothQuant/smoothquant_opt_w8a8_fakequant_eval.yml b/configs/quantization/SmoothQuant/smoothquant_opt_w8a8_fakequant_eval.yml new file mode 100644 index 000000000..8ce109fe6 --- /dev/null +++ b/configs/quantization/SmoothQuant/smoothquant_opt_w8a8_fakequant_eval.yml @@ -0,0 +1,35 @@ +base: + seed: &seed 42 +model: + type: Opt + path: model path + torch_dtype: auto +calib: + name: pileval + download: False + path: calib data path + n_samples: 512 + bs: 1 + seq_len: 512 + preproc: pileval_smooth + seed: *seed +eval: + eval_pos: [pretrain, transformed, fake_quant] + name: wikitext2 + download: False + path: eval data path + bs: 1 + seq_len: 2048 +quant: + method: SmoothQuant + weight: + bit: 8 + symmetric: True + granularity: per_channel + act: + bit: 8 + symmetric: True + granularity: per_token +save: + save_trans: False + save_path: ./save diff --git a/configs/quantization/SmoothQuant/smoothquant_opt_w8a8_fakequant_eval_general.yml b/configs/quantization/SmoothQuant/smoothquant_opt_w8a8_fakequant_eval_general.yml new file mode 100644 index 000000000..81e55b0c8 --- /dev/null +++ b/configs/quantization/SmoothQuant/smoothquant_opt_w8a8_fakequant_eval_general.yml @@ -0,0 +1,35 @@ +base: + seed: &seed 42 +model: + type: Opt + path: model path + torch_dtype: auto +calib: + name: pileval + download: False + path: calib data path + n_samples: 512 + bs: 1 + seq_len: 512 + preproc: general + seed: *seed +eval: + eval_pos: [pretrain, transformed, fake_quant] + name: wikitext2 + download: False + path: eval data path + bs: 1 + seq_len: 2048 +quant: + method: SmoothQuant + weight: + bit: 8 + symmetric: True + granularity: per_channel + act: + bit: 8 + symmetric: True + granularity: per_token +save: + save_trans: False + save_path: ./save diff --git a/configs/quantization/SpQR/spqr_w4a16_fakequant_eval.yml b/configs/quantization/SpQR/spqr_w4a16_fakequant_eval.yml new file mode 100644 index 000000000..5d0dbea08 --- /dev/null +++ b/configs/quantization/SpQR/spqr_w4a16_fakequant_eval.yml @@ -0,0 +1,54 @@ +base: + seed: &seed 0 +model: + type: Llama + path: model path + torch_dtype: auto +calib: + name: c4 + download: False + n_samples: 128 + path: calib data path + bs: 1 + seq_len: 2048 + preproc: c4_gptq + seed: *seed +eval: + eval_pos: [pretrain, fake_quant] + name: wikitext2 + download: False + path: eval data path + bs: 1 + seq_len: 2048 +quant: + method: SpQR + weight: + bit: 4 + symmetric: False + granularity: per_group + group_size: 16 + round_zp: False + special: + actorder: True + percdamp: 1 + blocksize: 128 + true_sequential: True + relative_threshold: 
0.2 + simplified_outliers: False + scale: + bit: 3 + symmetric: False + granularity: per_group + group_size: 16 + round_zp: False + zero: + bit: 3 + symmetric: False + granularity: per_group + group_size: 16 + round_zp: False + quant_out: True +save: + save_trans: False + save_fake: False + save_path: ./save diff --git a/configs/quantization/SpinQuant/spinquant_w4a4.yml b/configs/quantization/SpinQuant/spinquant_w4a4.yml new file mode 100644 index 000000000..0609839d7 --- /dev/null +++ b/configs/quantization/SpinQuant/spinquant_w4a4.yml @@ -0,0 +1,63 @@ +base: + seed: &seed 0 +model: + type: Llama + path: model path + torch_dtype: auto +eval: + eval_pos: [transformed, fake_quant] + name: wikitext2 + download: False + path: eval data path + bs: 1 + inference_per_block: False + seq_len: 2048 +quant: + method: SpinQuant + weight: + bit: 4 + symmetric: False + granularity: per_channel + group_size: -1 + qmax_to_tensor: True + calib_algo: mse + ste: True + act: + bit: 4 + symmetric: False + granularity: per_token + qmax_to_tensor: True + ste: True + special: + rotate_mode: hadamard + fp32_had: True + online_rotate: True +train: + data: + name: wikitext2 + download: False + path: calib data path + n_samples: 800 + bs: 1 + seq_len: 2048 + preproc: wikitext2_gptq + seed: *seed + cache_dir: None + train_args: + fp16: False + bf16: True + log_on_each_node: False + per_device_train_batch_size: 1 + logging_steps: 1 + learning_rate: 1.5 + weight_decay: 0. + lr_scheduler_type: "cosine" + gradient_checkpointing: True + max_steps: 1 + output_dir: output_path + logging_dir: your_log_path + logging_first_step: True +save: + save_trans: False + save_fake: False + save_path: ./save diff --git a/configs/sparsification/Magnitude/magnitude.yml b/configs/sparsification/Magnitude/magnitude.yml new file mode 100644 index 000000000..e4b8957dc --- /dev/null +++ b/configs/sparsification/Magnitude/magnitude.yml @@ -0,0 +1,30 @@ +base: + seed: &seed 42 +model: + type: Llama + path: model path + torch_dtype: auto +calib: + name: pileval + download: False + path: calib data path + n_samples: 128 + bs: -1 + seq_len: 512 + preproc: general + seed: *seed +eval: + eval_pos: [transformed] + name: wikitext2 + download: False + path: eval data path + bs: 1 + seq_len: 2048 +sparse: + method: Magnitude + weight: + sparsity: 0.5 +save: + save_fp: False + save_lightllm: False + save_path: ./save diff --git a/configs/sparsification/ShortGPT/shortgpt.yml b/configs/sparsification/ShortGPT/shortgpt.yml new file mode 100644 index 000000000..f651e92a4 --- /dev/null +++ b/configs/sparsification/ShortGPT/shortgpt.yml @@ -0,0 +1,30 @@ +base: + seed: &seed 42 +model: + type: Llama + path: model path + torch_dtype: auto +calib: + name: pileval + download: False + path: calib data path + n_samples: 128 + bs: -1 + seq_len: 512 + preproc: general + seed: *seed +eval: + eval_pos: [transformed] + name: [wikitext2, c4] + download: False + path: eval data path + seq_len: 2048 +sparse: + method: ShortGPT + weight: + n_prune_layers: 9 +save: + save_trans: True + save_fp: False + save_lightllm: False + save_path: ./save diff --git a/configs/sparsification/Wanda/wanda.yml b/configs/sparsification/Wanda/wanda.yml new file mode 100644 index 000000000..a768242af --- /dev/null +++ b/configs/sparsification/Wanda/wanda.yml @@ -0,0 +1,31 @@ +base: + seed: &seed 42 +model: + type: Llama + path: model path + torch_dtype: auto +calib: + name: pileval + download: False + path: calib data path + n_samples: 128 + bs: -1 + seq_len: 512 + preproc: general + seed: *seed 
+eval:
+    eval_pos: [transformed]
+    name: [wikitext2, c4]
+    download: False
+    path: eval data path
+    bs: 1
+    seq_len: 2048
+sparse:
+    method: Wanda
+    weight:
+        sparsity: 0.5
+        sparsity_out: False
+save:
+    save_fp: False
+    save_lightllm: False
+    save_path: ./save
diff --git a/docs/en/source/advanced/model_test.md b/docs/en/source/advanced/model_test.md
new file mode 100644
index 000000000..76dc06f95
--- /dev/null
+++ b/docs/en/source/advanced/model_test.md
@@ -0,0 +1,181 @@
+# Model accuracy test
+
+## Accuracy test pipeline
+
+LLMC supports basic PPL (perplexity) evaluation, but further downstream task evaluation is not supported by LLMC itself.
+
+It is common practice to test the model directly with evaluation tools, including but not limited to:
+
+1. [lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-harness)
+
+2. [opencompass](https://github.com/open-compass/opencompass)
+
+However, this evaluation method is not efficient. We therefore recommend separating model inference from accuracy evaluation: the model is run by an inference engine and served as an API, and the evaluation tool evaluates that API. This approach has the following benefits:
+
+1. Using an efficient inference engine for model inference speeds up the entire evaluation process.
+
+2. Model inference and model evaluation are decoupled, each tool handles what it is good at, and the code structure is clearer.
+
+3. Serving the model with an inference engine is closer to the actual deployment scenario, making it easier to align with the accuracy of the deployed model.
+
+We recommend the following compression-deployment-evaluation pipeline: **LLMC compression - lightllm inference - opencompass evaluation**
+
+Here are the links to the relevant tools:
+
+1. llmc, Large Language Model Compression Tool, [[GitHub](https://github.com/ModelTC/llmc), [Doc](https://llmc-zhcn.readthedocs.io/en/latest/)]
+
+2. Lightllm, Large Language Model Inference Engine, [[GitHub](https://github.com/ModelTC/lightllm)]
+
+3. OpenCompass, Large Language Model Evaluation Tool, [[GitHub](https://github.com/open-compass/opencompass), [Doc](https://opencompass.readthedocs.io/zh-cn/latest/)]
+
+## Use of the lightllm inference engine
+
+The official [lightllm](https://github.com/ModelTC/lightllm) repository has more detailed documentation; here is a simple quick start.
+
+### Start a service of a float model
+
+**install lightllm**
+
+```
+git clone https://github.com/ModelTC/lightllm.git
+cd lightllm
+pip install -v -e .
+```
+
+**start a service**
+
+```
+python -m lightllm.server.api_server --model_dir model_path \
+                                     --host 0.0.0.0 \
+                                     --port 1030 \
+                                     --nccl_port 2066 \
+                                     --max_req_input_len 6144 \
+                                     --max_req_total_len 8192 \
+                                     --tp 2 \
+                                     --trust_remote_code \
+                                     --max_total_token_num 120000
+```
+
+The above command starts a service with 2 GPUs on port 1030 of the machine.
+
+The number of GPUs is set with `--tp`; tensor-parallel inference is performed across the `tp` GPUs, which is suitable for larger models.
+
+The `max_total_token_num` in the above command affects the throughput during the test and can be set according to the lightllm [documentation](https://github.com/ModelTC/lightllm/blob/main/docs/ApiServerArgs.md). As long as GPU memory is not exhausted, larger values are better.
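+
+As a sketch (assuming the machine has spare GPUs; the port numbers 1031 and 2067 below are arbitrary examples), a second service only needs its own `--port` and `--nccl_port`, and, if necessary, its own GPUs selected via `CUDA_VISIBLE_DEVICES`:
+
+```
+# Second lightllm service on the same machine: use a different --port and
+# --nccl_port, and pick other GPUs with CUDA_VISIBLE_DEVICES if required.
+CUDA_VISIBLE_DEVICES=2,3 \
+python -m lightllm.server.api_server --model_dir model_path \
+                                     --host 0.0.0.0 \
+                                     --port 1031 \
+                                     --nccl_port 2067 \
+                                     --max_req_input_len 6144 \
+                                     --max_req_total_len 8192 \
+                                     --tp 2 \
+                                     --trust_remote_code \
+                                     --max_total_token_num 120000
+```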
+
+If you want to run multiple lightllm services on the same machine, choose different `port` and `nccl_port` values so that they do not conflict.
+
+### Simple testing of the service
+
+Execute the following Python script:
+
+```
+import requests
+import json
+
+url = 'http://localhost:1030/generate'
+headers = {'Content-Type': 'application/json'}
+data = {
+    'inputs': 'What is AI?',
+    "parameters": {
+        'do_sample': False,
+        'ignore_eos': False,
+        'max_new_tokens': 128,
+    }
+}
+response = requests.post(url, headers=headers, data=json.dumps(data))
+if response.status_code == 200:
+    print(response.json())
+else:
+    print('Error:', response.status_code, response.text)
+```
+
+If the script returns normally, the service is working.
+
+### Start a service for a quantized model
+
+```
+python -m lightllm.server.api_server --model_dir <model path> \
+                                     --host 0.0.0.0 \
+                                     --port 1030 \
+                                     --nccl_port 2066 \
+                                     --max_req_input_len 6144 \
+                                     --max_req_total_len 8192 \
+                                     --tp 2 \
+                                     --trust_remote_code \
+                                     --max_total_token_num 120000 \
+                                     --mode triton_w4a16
+```
+
+The added `--mode triton_w4a16` flag means that naive w4a16 quantization is used.
+
+After the service is started, verify again that it is working.
+
+The model path used in the above command points to the original pre-trained model, which has not been adjusted by LLMC. Following the LLMC documentation, you can enable `save_trans`, save an adjusted model, and then run the naive quantization service command above on it.
+
+## Using the OpenCompass evaluation tool
+
+The official [opencompass](https://github.com/open-compass/opencompass) repository has more detailed documentation; here is a simple quick start.
+
+**Install opencompass**
+
+```
+git clone https://github.com/open-compass/opencompass.git
+cd opencompass
+pip install -v -e .
+```
+
+**Modify the config**
+
+The config file is [here](https://github.com/open-compass/opencompass/blob/main/configs/eval_lightllm.py). It is used by OpenCompass to evaluate the accuracy of Lightllm's API service; note that the port in its `url` must be consistent with the Lightllm service port above.
+
+To select the evaluation datasets, modify this part of the code:
+
+```
+with read_base():
+    from .summarizers.leaderboard import summarizer
+    from .datasets.humaneval.deprecated_humaneval_gen_a82cae import humaneval_datasets
+```
+
+The above snippet tests the humaneval dataset; more supported datasets can be found [here](https://github.com/open-compass/opencompass/tree/main/configs/datasets).
+
+**Dataset download**
+
+Prepare the datasets in advance according to the OpenCompass [documentation](https://opencompass.readthedocs.io/en/latest/get_started/installation.html#dataset-preparation).
+
+**Run accuracy tests**
+
+After modifying the above configuration file, you can run the following command:
+```
+python run.py configs/eval_lightllm.py
+```
+When inference and metric calculation are complete, we obtain the evaluation results of the model. An output folder is generated in the current directory: the logs subfolder records the evaluation logs, and the summary subfolder records the accuracy on the tested datasets.
+
+## FAQ
+
+** Q1 **
+
+In the OpenCompass dataset configuration files, what do the different suffixes for the same dataset mean?
+ +** Solution ** + +Different suffixes represent different prompt templates, and for detailed OpenCompass questions, please refer to the OpenCompass documentation + +** Q2 ** + +The test accuracy of the Humaneval of the LLAMA model is too low + +** Solution ** + +You may need to delete the \n at the end of each entry in the Humaneval jsonl file in the dataset provided by OpenCompass and retest it + +** Q3 ** + +The test is still not fast enough + +** Solution ** + +You can consider whether the max_total_token_num parameter settings are reasonable when starting the lightllm service, and if the setting is too small, the test concurrency will be low + diff --git a/docs/en/source/configs.md b/docs/en/source/configs.md index eaf3baaff..2a2c0f440 100644 --- a/docs/en/source/configs.md +++ b/docs/en/source/configs.md @@ -8,46 +8,39 @@ Here's a brief config example base: seed: &seed 42 # Set random seed model: - type: model_type # Type of the model - path: model path # Path to the model - tokenizer_mode: fast # Type of the model's tokenizer - torch_dtype: auto # Data type of the model + type: Llama # Type of model + path: model path # Model path + tokenizer_mode: fast # The tokenizer type of the model + torch_dtype: auto # Model dtype calib: - name: pileval # Name of the calibration dataset - download: False # Whether to download the calibration dataset online - path: calib data path # Path to the calibration dataset - n_samples: 512 # Number of samples in the calibration dataset - bs: 1 # Batch size for the calibration dataset - seq_len: 512 # Sequence length for the calibration dataset - preproc: pileval_smooth # Preprocessing method for the calibration dataset - seed: *seed # Random seed for the calibration dataset + name: pileval # Calibration data set name + download: False # Whether the calibration dataset can be downloaded online + path: calib data path # Calibration dataset path + n_samples: 512 # Number of calibration samples + bs: 1 # Batch size of calibration dataset + seq_len: 512 # Sequence length of calibration dataset + preproc: pileval_smooth # Pre-procession of the calibration dataset + seed: *seed # Random seed for calibration dataset eval: - eval_pos: [pretrain, transformed, fake_quant] # Evaluation points - name: wikitext2 # Name of the evaluation dataset - download: False # Whether to download the evaluation dataset online - path: eval data path # Path to the evaluation dataset - bs: 1 # Batch size for the evaluation dataset - seq_len: 2048 # Sequence length for the evaluation dataset - eval_token_consist: False # Whether to evaluate the consistency of tokens between the quantized and original models + eval_pos: [pretrain, transformed, fake_quant] # eval positon + name: wikitext2 # The name of the evaluation dataset + download: False # Whether the evaluation dataset can be downloaded online + path: eval data path # Path to evaluation dataset + bs: 1 # The batch size of the evaluation dataset + seq_len: 2048 # Sequence length of the evaluation dataset quant: method: SmoothQuant # Compression method weight: - bit: 8 # Number of quantization bits for weights - symmetric: True # Whether weight quantization is symmetric - granularity: per_channel # Granularity of weight quantization + bit: 8 # The number of quantified bits of the weight + symmetric: True # Is weight quantization a symmetric quantization + granularity: per_channel # The granularity of weight quantification act: - bit: 8 # Number of quantization bits for activations - symmetric: True # Whether activation 
quantization is symmetric - granularity: per_token # Granularity of activation quantization - speical: # Special parameters required for the quantization algorithm. Refer to the comments in the configuration file and the original paper for usage. + bit: 8 # Number of activated quantization bits + symmetric: True # Whether activation quantization is symmetric quantization + granularity: per_token # The granularity of activation quantification save: - save_vllm: False # Whether to save the real quantized model for VLLM inference - save_sgl: False # Whether to save the real quantized model for Sglang inference - save_autoawq: False # Whether to save the real quantized model for AutoAWQ inference - save_mlcllm: False # Whether to save the real quantized model for MLC-LLM inference - save_trans: False # Whether to save the model after weight transformation - save_fake: False # Whether to save the fake quantized weights - save_path: /path/to/save # Save path + save_trans: False # Whether to save the adjusted model + save_path: ./save # Save path ``` # Configs' detailed description @@ -362,45 +355,12 @@ quant: ## save - save.save_vllm + save.save_trans -Whether to save as a [VLLM](https://github.com/vllm-project/vllm) inference backend-supported real quantized model. +Whether to save the adjusted model weights -When this option is enabled, the saved model weights will significantly shrink (real quantization), and it can be directly loaded for inference using the VLLM backend. This improves inference speed and reduces memory usage. For more details on the [VLLM](https://github.com/vllm-project/vllm) inference backend, refer to [this section](https://llmc-en.readthedocs.io/en/latest/backend/vllm.html#). +The saved weight is the weight that is more suitable for quantization after adjustment, and it is still saved in the form of FP16, and when it is deployed in the inference engine, you need to enable NAIVE quantization to achieve quantitative inference - save.save_sgl + save.save_path -Whether to save as a [Sglang](https://github.com/sgl-project/sglang) inference backend-supported real quantized model. - -When this option is enabled, the saved model weights will significantly shrink (real quantization), and it can be directly loaded for inference using the [Sglang](https://github.com/sgl-project/sglang) backend. This improves inference speed and reduces memory usage. For more details on the [Sglang](https://github.com/sgl-project/sglang) inference backend, refer to [this section](https://llmc-en.readthedocs.io/en/latest/backend/sglang.html). - - save.save_autoawq - -Whether to save as an [AutoAWQ](https://github.com/casper-hansen/AutoAWQ) inference backend-supported real quantized model. - -When this option is enabled, the saved model weights will significantly shrink (real quantization), and it can be directly loaded for inference using the [AutoAWQ](https://github.com/casper-hansen/AutoAWQ) backend. This improves inference speed and reduces memory usage. For more details on the [AutoAWQ](https://github.com/casper-hansen/AutoAWQ) inference backend, refer to [this section](https://llmc-en.readthedocs.io/en/latest/backend/autoawq.html). - - save.save_mlcllm - -Whether to save as an [MLC-LLM](https://github.com/mlc-ai/mlc-llm) inference backend-supported real quantized model. - -When this option is enabled, the saved model weights will significantly shrink (real quantization), and it can be directly loaded for inference using the [MLC-LLM](https://github.com/mlc-ai/mlc-llm) backend. 
This improves inference speed and reduces memory usage. For more details on the [MLC-LLM](https://github.com/mlc-ai/mlc-llm) inference backend, refer to [this section](https://llmc-en.readthedocs.io/en/latest/backend/mlcllm.html). - - save.save_trans - -Whether to save the adjusted model weights. - -The saved weights are adjusted to be more suitable for quantization, possibly containing fewer outliers. They are still saved in fp16/bf16 format (with the same file size as the original model). When deploying the model in the inference engine, the engine's built-in `naive quantization` needs to be used to achieve quantized inference. - -Unlike `save_vllm` and similar options, this option requires the inference engine to perform real quantization, while `llmc` provides a floating-point model weight that is more suitable for quantization. - -For example, the `save_trans` models exported by algorithms such as `SmoothQuant, Os+, AWQ, and Quarot` have `fewer outliers` and are more suitable for quantization. - - - save.save_fake - -Whether to save the fake quantized model. - - save.save_path - -The path where the model is saved. This path must be a new, non-existent directory, otherwise, LLMC will terminate the run and issue an appropriate error message. \ No newline at end of file +Save the path of the model, which needs to be a new directory path that does not exist, otherwise the llmc will terminate the operation with a corresponding error message diff --git a/docs/en/source/index.rst b/docs/en/source/index.rst index ba663eb57..ea56418eb 100644 --- a/docs/en/source/index.rst +++ b/docs/en/source/index.rst @@ -29,25 +29,7 @@ arxiv: https://arxiv.org/abs/2405.06001 :maxdepth: 2 :caption: Advanced - advanced/model_test_v1.md - advanced/model_test_v2.md + advanced/model_test.md advanced/custom_dataset.md advanced/mix_bits.md advanced/sparsification.md - -.. toctree:: - :maxdepth: 2 - :caption: Best Practice - - practice/awq.md - practice/awq_omni.md - practice/quarot_gptq.md - -.. toctree:: - :maxdepth: 2 - :caption: Backbend - - backend/vllm.md - backend/sglang.md - backend/autoawq.md - backend/mlcllm.md diff --git a/docs/en/source/quickstart.md b/docs/en/source/quickstart.md index 303f26111..9e9a118c0 100644 --- a/docs/en/source/quickstart.md +++ b/docs/en/source/quickstart.md @@ -1,53 +1,48 @@ - -# Installing LLMC +# Installation of llmc ``` git clone https://github.com/ModelTC/llmc.git -cd llmc/ pip install -r requirements.txt ``` -# Preparing the Model - -**LLMC** currently supports only `hugging face` format models. For example, you can find the `Qwen2-0.5B` model [here](https://huggingface.co/Qwen/Qwen2-0.5B). Instructions for downloading can be found [here](https://zhuanlan.zhihu.com/p/663712983). +llmc does not need to be installed. To use llmc you only need to add this to the script. +``` +PYTHONPATH=[llmc's save path]:$PYTHONPATH +``` -For users in Mainland China, you can also use the [hugging face mirror](https://hf-mirror.com/). +# Prepare the model -An example of a simple download can be: +Currently, llmc only supports models in the Hugging Face format. In the case of Qwen2-0.5B, the model can be found [here](https://huggingface.co/Qwen/Qwen2-0.5B). 
+A simple download example can be used: ``` pip install -U hf-transfer -HF_ENDPOINT=https://hf-mirror.com HF_HUB_ENABLE_HF_TRANSFER=1 huggingface-cli download --resume-download Qwen/Qwen2-0.5B --local-dir Qwen2-0.5B +HF_HUB_ENABLE_HF_TRANSFER=1 huggingface-cli download --resume-download Qwen/Qwen2-0.5B --local-dir Qwen2-0.5B ``` -# Downloading the Dataset +# Download the datasets -**LLMC** requires datasets which are categorized into `calibration datasets` and `evaluation datasets`. The `calibration dataset` can be downloaded [here](https://github.com/ModelTC/llmc/blob/main/tools/download_calib_dataset.py) and the `evaluation dataset` can be downloaded [here](https://github.com/ModelTC/llmc/blob/main/tools/download_eval_dataset.py). +The datasets required by llmc can be divided into calibration datasets and eval datasets. The calibration dataset can be downloaded [here](https://github.com/ModelTC/llmc/blob/main/tools/download_calib_dataset.py), and the eval dataset can be downloaded [here](https://github.com/ModelTC/llmc/blob/main/tools/download_eval_dataset.py). -Additionally, **LLMC** supports downloading datasets online, by setting `download` to True in the `config`. +Of course, llmc also supports online download of datasets, as long as the download in the config is set to True. -```yaml -calib: - name: pileval - download: True -``` -# Setting Configuration Files +# Set Configs -All `configuration files` can be found [here](https://github.com/ModelTC/llmc/blob/main/configs/), and details on the `configuration files` can be referenced [in this section](https://llmc-en.readthedocs.io/en/latest/configs.html). For example, the SmoothQuant `config` is available [here](https://github.com/ModelTC/llmc/blob/main/configs/quantization/methods/SmoothQuant/smoothquant_w_a.yml). +In the case of smoothquant, the config is [here](https://github.com/ModelTC/llmc/blob/main/configs/quantization/SmoothQuant/smoothquant_llama_w8a8_fakequant_eval.yml). -```yaml +``` base: seed: &seed 42 model: - type: Qwen2 # Set model name, supporting models like Llama, Qwen2, Llava, Gemma2, etc. - path: # Set the model weight path + type: Qwen2 # Set the model name, which can support Llama, Qwen2, Llava, Gemma2 and other models. + path: # Set model weight path. torch_dtype: auto calib: name: pileval download: False - path: # Set calibration dataset path + path: # Set calibration dataset path. n_samples: 512 bs: 1 seq_len: 512 @@ -57,7 +52,7 @@ eval: eval_pos: [pretrain, transformed, fake_quant] name: wikitext2 download: False - path: # Set evaluation dataset path + path: # Set eval dataset path. bs: 1 seq_len: 2048 quant: @@ -71,41 +66,40 @@ quant: symmetric: True granularity: per_token save: - save_vllm: True # If set to True, the real quantized integer model is saved for inference with VLLM engine - save_trans: False # If set to True, adjusted floating-point weights will be saved + save_trans: True # Set to True to save the adjusted weights. save_path: ./save ``` -For more options and details about `save`, please refer to [this section](https://llmc-en.readthedocs.io/en/latest/configs.html). +# Start to run -**LLMC** provides many [algorithm configuration files](https://github.com/ModelTC/llmc/tree/main/configs/quantization/methods) under the `configs/quantization/methods` path for reference. 
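+
+Before running, you can check that a config parses into the nested structure LLMC expects: the entry point (`llmc/__main__.py`, shown later in this patch) reads the YAML with `yaml.safe_load` and wraps it in an `EasyDict`, so every key becomes an attribute. Below is a minimal sketch; the config path is just the SmoothQuant example referenced above, and the printed values assume that config:
+
+```
+import yaml
+from easydict import EasyDict
+
+# Parse the config the same way llmc's entry point does.
+with open('configs/quantization/SmoothQuant/smoothquant_llama_w8a8_fakequant_eval.yml') as f:
+    config = EasyDict(yaml.safe_load(f))
+
+# Nested YAML keys map directly to attributes used throughout llmc.
+print(config.quant.method)       # SmoothQuant
+print(config.quant.weight.bit)   # 8
+print(config.save.save_trans)    # True -> save the adjusted weights
+```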
+Once you are prepared above, you can run the following commands +``` +PYTHONPATH=[llmc's save path]:$PYTHONPATH \ +python -m llmc \ +--config configs/quantization/SmoothQuant/smoothquant_llama_w8a8_fakequant_eval.yml +``` +Under scripts file folder, llmc also provides a lot of running [scripts](https://github.com/ModelTC/llmc/tree/main/scripts) for your reference -# Running LLMC +``` +#!/bin/bash -**LLMC** does not require installation; simply modify the `local path` of **LLMC** in the [run script](https://github.com/ModelTC/llmc/blob/main/scripts/run_llmc.sh) as follows: +gpu_id=0 # Set the GPU id used. +export CUDA_VISIBLE_DEVICES=$gpu_id -```bash -llmc=/path/to/llmc +llmc= # Set the save path of llmc. export PYTHONPATH=$llmc:$PYTHONPATH -``` -You need to modify the configuration path in the [run script](https://github.com/ModelTC/llmc/blob/main/scripts/run_llmc.sh) according to the algorithm you want to run. For example, `${llmc}/configs/quantization/methods/SmoothQuant/smoothquant_w_a.yml` refers to the SmoothQuant quantization configuration file. `task_name` specifies the name of the `log file` generated by **LLMC** during execution. +task_name=smoothquant_llama_w8a8_fakequant_eval # Set task_name, the file name used to save the log. -```bash -task_name=smooth_w_a -config=${llmc}/configs/quantization/methods/SmoothQuant/smoothquant_w_a.yml -``` - -Once you have modified the LLMC path and config path in the run script, execute it: +# Select a config to run. +nohup \ +python -m llmc \ +--config ../configs/quantization/SmoothQuant/smoothquant_llama_w8a8_fakequant_eval.yml \ +> ${task_name}.log 2>&1 & -```bash -bash run_llmc.sh +echo $! > ${task_name}.pid ``` -# Quantization Inference - -If you have set the option to save `real quantized` models in the configuration file, such as `save_vllm: True`, then the saved `real quantized models` can be directly used for inference with the corresponding `inference backends`. For more details, refer to the `Backend` section of the [documentation](https://llmc-en.readthedocs.io/en/latest). - # FAQ ** Q1 ** diff --git a/docs/zh_cn/source/advanced/model_test.md b/docs/zh_cn/source/advanced/model_test.md new file mode 100644 index 000000000..3db87caf4 --- /dev/null +++ b/docs/zh_cn/source/advanced/model_test.md @@ -0,0 +1,180 @@ +# 模型精度测试 + +## 精度测试流程 + +llmc支持基础的ppl(perplexity,困惑度)评测,但是更多的下游任务评测,llmc本身并不支持。 + +常见的做法使用评测工具直接对模型进行推理测试,目前已有的评测工具包括但不限于 + +1. [lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-harness) + +2. [opencompass](https://github.com/open-compass/opencompass) + +但是这种评测方法评测效率不高,我们推荐使用**推理引擎评测工具分离**的方式进行模型精度评测,模型由推理引擎进行推理,并以api的形式serving起来,评测工具对该api进行评测。这种方式有以下的好处: + +1. 使用高效的推理引擎进行模型推理,可以加速整个评测进程 + +2. 将模型的推理和模型的评测分离开,各自负责份内专业的事,代码结构更清晰 + +3. 使用推理引擎推理模型,更符合实际部署的场景,和模型实际部署的精度更容易对齐 + +我们在此推荐并介绍使用以下的模型的压缩-部署-评测流程:**llmc压缩-lightllm推理-opencompass评测** + +以下是相关工具的链接: + +1. llmc,大模型压缩工具,[[github](https://github.com/ModelTC/llmc),[文档](https://llmc-zhcn.readthedocs.io/en/latest/)] + +2. lightllm,大模型推理引擎,[[github](https://github.com/ModelTC/lightllm)] + +3. opencompass,大模型评测工具,[[github](https://github.com/open-compass/opencompass),[文档](https://opencompass.readthedocs.io/zh-cn/latest/)] + + +## lightllm推理引擎的使用 + +[lightllm](https://github.com/ModelTC/llmc)官方仓库有着更详细的文档,这里仅给出一个简单快速入门的使用文档 + + 起一个float模型的服务 + +**安装lightllm** + +``` +git clone https://github.com/ModelTC/lightllm.git +cd lightllm +pip install -v -e . 
+``` + +**起服务** + +``` +python -m lightllm.server.api_server --model_dir 模型路径 \ + --host 0.0.0.0 \ + --port 1030 \ + --nccl_port 2066 \ + --max_req_input_len 6144 \ + --max_req_total_len 8192 \ + --tp 2 \ + --trust_remote_code \ + --max_total_token_num 120000 +``` + +上述命令将在本机的1030端口,起一个2卡的服务 + +上述命令可以通过tp的数量设置,在tp张卡上进行TensorParallel推理,适用于较大的模型的推理。 + +上述命令中的max_total_token_num,会影响测试过程中的吞吐性能,可以根据[lightllm文档](https://github.com/ModelTC/lightllm/blob/main/docs/ApiServerArgs.md),进行设置。只要不爆显存,往往设置越大越好。 + +如果要在同一个机器上起多个lightllm服务,需要重新设定上面的port和nccl_port,不要有冲突即可。 + + + 对服务进行简单测试 + +执行下面的python脚本 + +``` +import requests +import json + +url = 'http://localhost:1030/generate' +headers = {'Content-Type': 'application/json'} +data = { + 'inputs': 'What is AI?', + "parameters": { + 'do_sample': False, + 'ignore_eos': False, + 'max_new_tokens': 128, + } +} +response = requests.post(url, headers=headers, data=json.dumps(data)) +if response.status_code == 200: + print(response.json()) +else: + print('Error:', response.status_code, response.text) +``` + +若上述脚本是有正常返回,说明服务正常 + + 起一个量化模型的服务 + +``` +python -m lightllm.server.api_server --model_dir 模型路径 \ + --host 0.0.0.0 \ + --port 1030 \ + --nccl_port 2066 \ + --max_req_input_len 6144 \ + --max_req_total_len 8192 \ + --tp 2 \ + --trust_remote_code \ + --max_total_token_num 120000 \ + --mode triton_w4a16 +``` + +上述命令加了一个`--mode triton_w4a16`,表示使用了w4a16的naive量化 + +起完服务,同样需要验证一下服务是否正常 + +上述的命令使用的模型路径是原始预训练的模型,并没有经过llmc调整。可以按照llmc的文档,打开save_trans,保存一个调整之后的模型,然后再运行上述的naive量化服务命令 + +## opencompass评测工具的使用 + +[opencompass](https://github.com/open-compass/opencompass)官方仓库有着更详细的文档,这里仅给出一个简单快速入门的使用文档 + +**安装opencompass** + +``` +git clone https://github.com/open-compass/opencompass.git +cd opencompass +pip install -v -e . +``` + +**修改配置文件** + +配置文件在[这里](https://github.com/open-compass/opencompass/blob/main/configs/eval_lightllm.py),这个配置文件是用于opencompass来评测lightllm的api服务的精度,需要注意的是里面的`url`里面的port,要和上述的lightllm的服务port保持一致 + +评测的数据集选择,需要修改这部分代码 + +``` +with read_base(): + from .summarizers.leaderboard import summarizer + from .datasets.humaneval.deprecated_humaneval_gen_a82cae import humaneval_datasets +``` + +上述的代码片段,表示测试humaneval数据集,更多的数据集测试支持,可以查看[这里](https://github.com/open-compass/opencompass/tree/main/configs/datasets) + +**数据集下载** + +需要根据opencompass的[文档](https://opencompass.readthedocs.io/zh-cn/latest/get_started/installation.html#id2),最好数据集的准备 + +**运行精度测试** + +修改好上述的配置文件后,即可运行下面的命令 +``` +python run.py configs/eval_lightllm.py +``` +当模型完成推理和指标计算后,我们便可获得模型的评测结果。其中会在当前目录下生成output文件夹,logs子文件夹记录着评测中的日志,最后生成summary子文件会记录所测数据集的精度 + +## 常见问题 + +** 问题1 ** + +opencompass中的数据集配置文件,同一个数据集有不同的后缀,表示的是什么意思 + +** 解决方法 ** + +不同后缀表示不同的prompt模板,详细的opencompass问题,可以查看opencompass文档 + +** 问题2 ** + +llama模型的humaneval的测试精度过低 + +** 解决方法 ** + +可能需要将opencompass提供的数据集中的humaneval的jsonl文件里面每一条末尾的\n给删除,再重新测试一下 + +** 问题3 ** + +测试速度还是不够快 + +** 解决方法 ** + +可以考虑lightllm起服务时的max_total_token_num参数设置是否合理,过小的设置,会导致测试并发偏低 + diff --git a/docs/zh_cn/source/configs.md b/docs/zh_cn/source/configs.md index 9fbf1382d..cd1b9f3d9 100644 --- a/docs/zh_cn/source/configs.md +++ b/docs/zh_cn/source/configs.md @@ -1,6 +1,6 @@ # 配置的简要说明 -所有的配置均可以在[这里](https://github.com/ModelTC/llmc/tree/main/configs)找到,具体地,包括[量化算法](https://github.com/ModelTC/llmc/tree/main/configs/quantization/methods),[量化实践以及方法组合技](https://github.com/ModelTC/llmc/tree/main/configs/quantization/combination), 以及[推理后端](https://github.com/ModelTC/llmc/tree/main/configs/quantization/backend)相关的配置 
+所有的配置均可以在[这里](https://github.com/ModelTC/llmc/tree/main/configs)找到 下面的是一个简要的配置例子 @@ -8,7 +8,7 @@ base: seed: &seed 42 # 设置随机种子 model: - type: model_type # 模型的类型 + type: Llama # 模型的类型 path: model path # 模型的路径 tokenizer_mode: fast # 模型的tokenizer类型 torch_dtype: auto # 模型的dtype @@ -28,7 +28,6 @@ eval: path: eval data path # 评测数据集的路径 bs: 1 # 评测数据集的batch size seq_len: 2048 # 评测数据集的长度 - eval_token_consist: False # 是否评测量化模型和原始模型输出token的一致性 quant: method: SmoothQuant # 压缩方法 weight: @@ -39,15 +38,9 @@ quant: bit: 8 # 激活的量化bit数 symmetric: True # 激活量化是否是对称量化 granularity: per_token # 激活量化的粒度 - speical: # 量化算法需要的特殊参数,可参照每个算法的配置文件的注释以及原论文掌握其用法 save: - save_vllm: False # 是否保存真实量化的模型,以供VLLM推理 - save_sgl: False # 是否保存真实量化的模型,以供Sglang推理 - save_autoawq: False # 是否保存真实量化的模型,以供AutoAWQ推理 - save_mlcllm: False # 是否保存真实量化的模型,以供MLC-LLM推理 - save_trans: False # 是否保存权重变换之后的模型 - save_fake: False # 是否保存伪量化的权重 - save_path: /path/to/save # 保存路径 + save_trans: False # 是否保存调整之后的模型 + save_path: ./save # 保存路径 ``` # 配置的详细说明 @@ -210,11 +203,10 @@ general在[base_dataset](https://github.com/ModelTC/llmc/blob/main/llmc/data/dat ## eval -llmc默认支持评测量化模型的困惑度(PPL), 以及量化模型和原始模型输出token的一致性。此外还支持通过harness和opencompass评测下游任务的精度(可见[评测章节v1](https://llmc-zhcn.readthedocs.io/en/latest/advanced/model_test_v1.md)和[v2](https://llmc-zhcn.readthedocs.io/en/latest/advanced/model_test_v2.md)) eval.eval_pos -表示评测PPL的位点,目前支持三个位点可以被评测 +表示评测的位点,目前支持三个位点可以被评测 1. pretrain @@ -264,7 +256,7 @@ inference_per_block: True 同时测试多个数据集 -llmc也支持同时评测多个数据集的PPL +llmc也支持同时评测多个数据集 下面是评测单个wikitext2数据集的例子 @@ -291,9 +283,6 @@ eval: 如果直接使用llmc的[下载脚本](https://github.com/ModelTC/llmc/blob/main/tools/download_eval_dataset.py),则共有上层目录就是`--save_path`所指定的数据集保存路径 - eval.eval_token_consist - -表示是否评测量化模型和原始模型输出token的一致性,取值范围[0,1], 越接近1越说明量化模型的性能越接近原始模型 ## quant @@ -317,24 +306,11 @@ eval: 权重的量化粒度,支持以下粒度 -1. per_tensor - -2. per_channel - -3. per_group - - quant.weight.group_size - -当权重是per-group量化时,其表示group的大小 - - quant.weight.ste - -在权重量化的取整过程中,是否用直通估计器(straight-through estimator)来使round函数可以产生梯度以便进行反向传播 - - quant.weight.calib +1. per tensor -权重的校准方法,默认采用minmax,除此之外,llmc还支持learnable,mse两种方法,可能会取得更好的结果 +2. per channel +3. per group quant.act @@ -358,26 +334,15 @@ eval: 3. 
per head - quant.act.ste - -在激活量化的取整过程中,是否用直通估计器(straight-through estimator)来使round函数可以产生梯度以便进行反向传播 - - quant.act.calib - -激活的校准方法,默认采用minmax,且只支持minmax - -其中如果quant.method设置的为RTN,激活量化可以支持静态per tensor设置,下面是,权重静态per-channel量化,激活静态per tensor量化的配置和激活动态per token 8bit量化的配置 +其中如果quant.method设置的为RTN,激活量化可以支持静态per tensor设置,下面是一个W8A8,激活静态per tensor量化的配置 ``` quant: method: RTN - # 静态per-channel量化 weight: bit: 8 symmetric: True granularity: per_channel - - # 静态per-tensor量化 act: bit: 8 symmetric: True @@ -385,63 +350,13 @@ quant: static: True ``` -``` -quant: - method: RTN - #静态per-channel量化 - weight: - bit: 8 - symmetric: True - granularity: per_channel - - # 动态per-tensor量化 - act: - bit: 8 - symmetric: True - granularity: per_token -``` - ## save - save.save_vllm - -是否保存为[VLLM](https://github.com/vllm-project/vllm)推理后端支持的真实量化模型 - -当开启该选项时,你会发现保存的模型权重显著变小(真实量化),同时可以通过VLLM后端来直接加载推理,提高推理速度以及降低显存占用,有关于[VLLM](https://github.com/vllm-project/vllm)推理后端的内容见[该章节](https://llmc-zhcn.readthedocs.io/en/latest/backend/vllm.html) - - save.save_sgl - -是否保存为[Sglang](https://github.com/sgl-project/sglang)推理后端支持的真实量化 - -当开启该选项时,你会发现保存的模型权重显著变小(真实量化),同时可以通过[Sglang](https://github.com/sgl-project/sglang)后端来直接加载推理,提高推理速度以及降低显存占用,有关于[Sglang](https://github.com/sgl-project/sglang)推理后端的内容见[该章节](https://llmc-zhcn.readthedocs.io/en/latest/backend/sglang.html) - - - save.save_autoawq - -是否保存为[AutoAWQ](https://github.com/casper-hansen/AutoAWQ)推理后端支持的真实量化模型 - -当开启该选项时,你会发现保存的模型权重显著变小(真实量化),同时可以通过[AutoAWQ](https://github.com/casper-hansen/AutoAWQ)后端来直接加载推理,提高推理速度以及降低显存占用,有关于[AutoAWQ](https://github.com/casper-hansen/AutoAWQ)推理后端的内容见[该章节](https://llmc-zhcn.readthedocs.io/en/latest/backend/autoawq.html) - - save.save_mlcllm - -是否保存为[MLC-LLM](https://github.com/mlc-ai/mlc-llm)推理后端支持的真实量化模型 - -当开启该选项时,你会发现保存的模型权重显著变小(真实量化),同时可以通过[MLC-LLM](https://github.com/mlc-ai/mlc-llm)后端来直接加载推理,提高推理速度以及降低显存占用,有关于[MLC-LLM](https://github.com/mlc-ai/mlc-llm)推理后端的内容见[该章节](https://llmc-zhcn.readthedocs.io/en/latest/backend/mlcllm.html) - - save.save_trans 是否保存调整之后的模型权重 -保存的该权重,是经过调整之后的更适合量化的权重,其可能包含更少的离群值,其还是以fp16/bf16的格式保存(权重文件大小与原始模型保持一致),在推理引擎中部署的时候,需要开启推理引擎的`naive量化`功能,即可实现量化推理。 - -与`save_vllm`等不同的是,其需要该推理引擎来完成真实量化,而`llmc`提供一个更适合量化的浮点模型权重。 - -例如`SmoothQuant/Os+/AWQ/Quarot`等算法导出的`save_trans`模型,其具有`更少的outliers`,更适合量化。 - - save.save_fake - -是否保存伪量化的模型 +保存的该权重,是经过调整之后的更适合量化的权重,它还是以fp16形式保存,在推理引擎中部署的时候,需要开启naive量化,即可实现量化推理 save.save_path diff --git a/docs/zh_cn/source/index.rst b/docs/zh_cn/source/index.rst index e07a1de84..a2e66ce60 100644 --- a/docs/zh_cn/source/index.rst +++ b/docs/zh_cn/source/index.rst @@ -30,25 +30,8 @@ arxiv链接: https://arxiv.org/abs/2405.06001 :maxdepth: 2 :caption: 进阶用法 - advanced/model_test_v1.md - advanced/model_test_v2.md + advanced/model_test.md advanced/custom_dataset.md advanced/mix_bits.md advanced/sparsification.md -.. toctree:: - :maxdepth: 2 - :caption: 量化最佳实践 - - practice/awq.md - practice/awq_omni.md - practice/quarot_gptq.md - -.. 
toctree:: - :maxdepth: 2 - :caption: 量化推理后端 - - backend/vllm.md - backend/sglang.md - backend/autoawq.md - backend/mlcllm.md \ No newline at end of file diff --git a/docs/zh_cn/source/quickstart.md b/docs/zh_cn/source/quickstart.md index 6d4854d12..34811d21d 100644 --- a/docs/zh_cn/source/quickstart.md +++ b/docs/zh_cn/source/quickstart.md @@ -1,14 +1,18 @@ -# LLMC的安装 +# llmc的安装 ``` git clone https://github.com/ModelTC/llmc.git -cd llmc/ pip install -r requirements.txt ``` +llmc无需安装,使用llmc只需在脚本中添加 +``` +PYTHONPATH=llmc的下载路径:$PYTHONPATH +``` + # 准备模型 -**LLMC**目前仅支持`hugging face`格式的模型。以`Qwen2-0.5B`为例,可以在[这里](https://huggingface.co/Qwen/Qwen2-0.5B)找到模型。下载方式可以参考[这里](https://zhuanlan.zhihu.com/p/663712983) +llmc目前仅支持hugging face格式的模型。以Qwen2-0.5B为例,可以在[这里](https://huggingface.co/Qwen/Qwen2-0.5B)找到模型。下载方式可以参考[这里](https://zhuanlan.zhihu.com/p/663712983) 大陆地区用户还可以使用[hugging face镜像](https://hf-mirror.com/) @@ -21,22 +25,15 @@ HF_ENDPOINT=https://hf-mirror.com HF_HUB_ENABLE_HF_TRANSFER=1 huggingface-cli do # 下载数据集 -**LLMC**需要的数据集可以分为`校准数据集`和`测试数据集`。`校准数据集`可以在[这里](https://github.com/ModelTC/llmc/blob/main/tools/download_calib_dataset.py)下载,`测试数据`集可以在[这里](https://github.com/ModelTC/llmc/blob/main/tools/download_eval_dataset.py)下载 +llmc需要的数据集可以分为校准数据集和测试数据集。校准数据集可以在[这里](https://github.com/ModelTC/llmc/blob/main/tools/download_calib_dataset.py)下载,测试数据集可以在[这里](https://github.com/ModelTC/llmc/blob/main/tools/download_eval_dataset.py)下载 -当然**LLMC**也支持在线下载数据集,只需要在`config`中的`download`设置为True即可。 +当然llmc也支持在线下载数据集,只需要在config中的download设置为True即可。 -```yaml -calib: - name: pileval - download: True -``` - -# 设置配置文件 +# 设置config -所有的`配置文件`都在[这里](https://github.com/ModelTC/llmc/blob/main/configs/)可以找到,同时关于`配置文件`的说明请参考[此章节](https://llmc-zhcn.readthedocs.io/en/latest/configs.html) -以SmoothQuant为例,`config`在[这里](https://github.com/ModelTC/llmc/blob/main/configs/quantization/methods/SmoothQuant/smoothquant_w_a.yml) +以smoothquant为例,config在[这里](https://github.com/ModelTC/llmc/blob/main/configs/quantization/SmoothQuant/smoothquant_llama_w8a8_fakequant_eval.yml) -```yaml +``` base: seed: &seed 42 model: @@ -70,39 +67,39 @@ quant: symmetric: True granularity: per_token save: - save_vllm: True # 当设置为True时,可以保存真实量化的整型模型,并通过VLLM推理引擎进行推理 - save_trans: False # 当设置为True,可以保存下调整之后的浮点权重 + save_trans: True # 设置为True,可以保存下调整之后的权重 save_path: ./save ``` -有关于`save`的更多选项和说明,请参照[此章节](https://llmc-zhcn.readthedocs.io/en/latest/configs.html) - - -**LLMC**在`configs/quantization/methods`路径下,提供了很多的[算法配置文件](https://github.com/ModelTC/llmc/tree/main/configs/quantization/methods)供大家参考。 # 开始运行 -**LLMC**无需安装,只需在[运行脚本](https://github.com/ModelTC/llmc/blob/main/scripts/run_llmc.sh)中将`/path/to/llmc`修改为**LLMC**的`本地路径`即可。 -```bash -llmc=/path/to/llmc -export PYTHONPATH=$llmc:$PYTHONPATH +做好上面的准备之后,可以通过以下的命令运行 ``` +PYTHONPATH=llmc的下载路径:$PYTHONPATH \ +python -m llmc \ +--config configs/quantization/SmoothQuant/smoothquant_llama_w8a8_fakequant_eval.yml +``` +llmc在scripts下,也提供了很多的运行[脚本](https://github.com/ModelTC/llmc/tree/main/scripts)供大家参考 -根据你想运行的算法,需相应修改[运行脚本](https://github.com/ModelTC/llmc/blob/main/scripts/run_llmc.sh)中的配置路径。例如,`${llmc}/configs/quantization/methods/SmoothQuant/smoothquant_w_a.yml`对应的是 SmoothQuant 量化的配置文件。`task_name`用于指定**LLMC**运行时生成的`日志文件名称`。 - -```bash -task_name=smooth_w_a -config=${llmc}/configs/quantization/methods/SmoothQuant/smoothquant_w_a.yml ``` +#!/bin/bash -当在运行脚本中,修改完相应的LLMC路径和config路径后,运行即可: +gpu_id=0 # 设置使用的GPU id +export CUDA_VISIBLE_DEVICES=$gpu_id -```bash -bash run_llmc.sh -``` +llmc= # 设置llmc的下载路径 +export 
PYTHONPATH=$llmc:$PYTHONPATH -# 量化推理 +task_name=smoothquant_llama_w8a8_fakequant_eval # 设置task_name,用于保存log的文件名 -假设你在配置文件中指定了保存`真实量化`模型的选项,例如 `save_vllm: True`,那么保存的`真实量化模型`即可直接用于对应的`推理后端`执行,具体可参照[文档](https://llmc-zhcn.readthedocs.io/en/latest)的`量化推理后端`章节。 +# 选择某个config运行 +nohup \ +python -m llmc \ +--config ../configs/quantization/SmoothQuant/smoothquant_llama_w8a8_fakequant_eval.yml \ +> ${task_name}.log 2>&1 & + +echo $! > ${task_name}.pid +``` # 常见问题 diff --git a/examples/backend/autoawq/infer_with_autoawq.py b/examples/backend/autoawq/infer_with_autoawq.py deleted file mode 100644 index a157f0620..000000000 --- a/examples/backend/autoawq/infer_with_autoawq.py +++ /dev/null @@ -1,34 +0,0 @@ - - -import sys - -autoawq_path = '/path/to/AutoAWQ' -sys.path.append(autoawq_path) - -import torch -from awq import AutoAWQForCausalLM -from transformers import AutoTokenizer, TextStreamer - -model_path = '/path/to/save_for_autoawq_awq_w4/autoawq_quant_model' - -tokenizer = AutoTokenizer.from_pretrained(model_path) -streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True) - -model = AutoAWQForCausalLM.from_quantized( - model_path, - torch_dtype=torch.float16, - low_cpu_mem_usage=True, - device_map='auto', -) - - -prompt_text = 'The president of the United States is ' -inputs = tokenizer(prompt_text, return_tensors='pt').to('cuda') - -outputs = model.generate( - **inputs, - do_sample=False, - max_new_tokens=100, - streamer=streamer, - eos_token_id=[tokenizer.eos_token_id, tokenizer.convert_tokens_to_ids('<|eot_id|>')] -) diff --git a/examples/backend/mlcllm/infer_with_mlcllm.py b/examples/backend/mlcllm/infer_with_mlcllm.py deleted file mode 100644 index be9523aa7..000000000 --- a/examples/backend/mlcllm/infer_with_mlcllm.py +++ /dev/null @@ -1,17 +0,0 @@ -from mlc_llm import MLCEngine - -# Create engine -model_path = './dist/llama2-7b-chat-MLC/' -engine = MLCEngine(model_path) - -# Run chat completion in OpenAI API. 
-for response in engine.chat.completions.create( - messages=[{'role': 'user', 'content': 'What is the meaning of life?'}], - model=model_path, - stream=True, -): - for choice in response.choices: - print(choice.delta.content, end='', flush=True) -print('\n') - -engine.terminate() diff --git a/examples/backend/sglang/infer_with_sglang.py b/examples/backend/sglang/infer_with_sglang.py deleted file mode 100644 index 2a92b807c..000000000 --- a/examples/backend/sglang/infer_with_sglang.py +++ /dev/null @@ -1,13 +0,0 @@ -import openai - -client = openai.Client( - base_url='http://127.0.0.1:10000/v1', api_key='EMPTY') - -# Text completion -response = client.completions.create( - model='default', - prompt='The president of the United States is', - temperature=0, - max_tokens=32, -) -print(response) diff --git a/examples/backend/vllm/infer_with_vllm.py b/examples/backend/vllm/infer_with_vllm.py deleted file mode 100644 index 8b77349d1..000000000 --- a/examples/backend/vllm/infer_with_vllm.py +++ /dev/null @@ -1,21 +0,0 @@ -from transformers import AutoTokenizer -from vllm import LLM, SamplingParams - -model_path = '/path/to/save_for_vllm_awq_w4/real_quant_model' -model = LLM(model_path) -tokenizer = AutoTokenizer.from_pretrained(model_path) - -prompts = [ - 'Hello, my name is', - 'The president of the United States is', - 'The capital of France is', - 'The future of AI is', -] -sampling_params = SamplingParams(temperature=0.8, top_p=0.95) - -outputs = model.generate(prompts, sampling_params) - -for output in outputs: - prompt = output.prompt - generated_text = output.outputs[0].text - print(f'Prompt: {prompt!r}, Generated text: {generated_text!r}') diff --git a/llmc/__main__.py b/llmc/__main__.py index 08a03ab4e..62f43f670 100644 --- a/llmc/__main__.py +++ b/llmc/__main__.py @@ -6,19 +6,19 @@ import time import torch +import transformers import yaml from easydict import EasyDict from loguru import logger -from torch.distributed import destroy_process_group, init_process_group +from transformers import (LlamaTokenizerFast, Trainer, TrainingArguments, + default_data_collator) from llmc.compression.quantization import * from llmc.compression.sparsification import * -from llmc.data import BaseDataset, BaseTokenizer -from llmc.eval import AccuracyEval, PerplexityEval, TokenConsistencyEval +from llmc.data import BaseDataset, BaseTokenizer, TrainJsonDataset +from llmc.eval import PerplexityEval from llmc.models import * -from llmc.utils import (check_config, mkdirs, print_important_package_version, - seed_all, update_autoawq_quant_config, - update_vllm_quant_config) +from llmc.utils import check_config, mkdirs, seed_all from llmc.utils.registry_factory import ALGO_REGISTRY, MODEL_REGISTRY @@ -43,66 +43,99 @@ def main(config): eval_config.name = name if len(name_list) != 1: # eval multi datasets eval_config.path = os.path.join(config.eval.path, name) - if config.eval.type == 'acc': - acc_eval = AccuracyEval(eval_config) - eval_list.append(acc_eval) - else: - ppl_eval = PerplexityEval(tokenizer.get_tokenizer(), eval_config) - eval_list.append(ppl_eval) + ppl_eval = PerplexityEval(tokenizer.get_tokenizer(), eval_config) + eval_list.append(ppl_eval) if 'eval' in config and 'pretrain' in config.eval.eval_pos: - if config.eval.type == 'acc': - for acc_eval in eval_list: - acc = acc_eval.eval(model) - logger.info(f'{config.eval.name} acc : {acc}') - else: - for ppl_eval in eval_list: - ppl = ppl_eval.eval(model) - logger.info(f'{ppl_eval.dataset} ppl : {ppl}') + for ppl_eval in eval_list: + ppl = 
ppl_eval.eval(model) + logger.info(f'{ppl_eval.dataset} ppl : {ppl}') if not config.get('calib', False): blockwise_opt = ALGO_REGISTRY[config.quant.method]( - model, - quant_config=config.quant, - input=None, - padding_mask=None, - config=config + model, quant_config=config.quant, input=None, config=config ) blockwise_opt.run_block_loop() else: - dataset = BaseDataset(tokenizer.get_tokenizer(), config.calib, model.processor) - calib_data, padding_mask = dataset.get_calib_dataset() - model.collect_first_block_input(calib_data, config.calib.type) + dataset = BaseDataset(tokenizer.get_tokenizer(), config.calib) + calib_data = dataset.get_calib_dataset() + model.collect_first_block_input(calib_data) del calib_data gc.collect() torch.cuda.empty_cache() if not config.get('sparse', False): blockwise_opt = ALGO_REGISTRY[config.quant.method]( - model, - config.quant, - model.get_first_block_input(), - padding_mask, - config + model, config.quant, model.get_first_block_input(), config ) else: blockwise_opt = ALGO_REGISTRY[config.sparse.method]( - model, - config.sparse, - model.get_first_block_input(), - padding_mask, - config + model, config.sparse, model.get_first_block_input(), config ) blockwise_opt.run_block_loop() + + if 'train' in config: + + blockwise_opt.deploy('train_rotate_quant') + + dataset = BaseDataset(tokenizer.get_tokenizer(), config.train.data) + + train_tokenizer = LlamaTokenizerFast.from_pretrained( + pretrained_model_name_or_path=config.model.path, + cache_dir=config.train.data.cache_dir, + model_max_length=config.train.data.seq_len, + padding_side='right', + use_fast=True, + add_eos_token=False, + add_bos_token=False, + ) + + + if 'eval' in config and len(config.eval.eval_pos): + eval_list = [] + name_list = ( + config.eval.name + if not isinstance(config.eval.name, str) + else [config.eval.name] + ) + for name in name_list: + eval_config = copy.deepcopy(config.eval) + eval_config.name = name + if len(name_list) != 1: # eval multi datasets + eval_config.path = os.path.join(config.eval.path, name) + ppl_eval = PerplexityEval(train_tokenizer, eval_config) + eval_list.append(ppl_eval) + + train_data = TrainJsonDataset( + dataset.calib_dataset, + train_tokenizer, + block_size=config.train.data.seq_len, + ) + + train_args = TrainingArguments(**config.train.train_args) + trainable_parameters = blockwise_opt.get_trainable_params() + blockwise_opt.model.model.seqlen = config.train.data.seq_len + optimizer = SGDG(trainable_parameters, lr=config.train.train_args.learning_rate, stiefel=True) + + trainer = Trainer( + model=blockwise_opt.model.model, + tokenizer=train_tokenizer, + args=train_args, + train_dataset=train_data, + eval_dataset=None, + data_collator=default_data_collator, + optimizers=(optimizer, None), + ) + + trainer.train() + + logger.info('End training') + + if 'eval' in config and 'transformed' in config.eval.eval_pos: blockwise_opt.deploy('origin_float') - if config.eval.type == 'acc': - for acc_eval in eval_list: - acc = acc_eval.eval(model) - logger.info(f'{config.eval.name} acc : {acc}') - else: - for ppl_eval in eval_list: - ppl = ppl_eval.eval(model) - logger.info(f'{ppl_eval.dataset} ppl : {ppl}') + for ppl_eval in eval_list: + ppl = ppl_eval.eval(model) + logger.info(f'{ppl_eval.dataset} ppl : {ppl}') if 'save' in config and config.save.get('save_trans', False): blockwise_opt.save_model(save_trans_path) @@ -119,123 +152,35 @@ def main(config): if 'eval' in config and 'fake_quant' in config.eval.eval_pos: blockwise_opt.deploy('fake_quant') - if config.eval.type == 
'acc': - for acc_eval in eval_list: - acc = acc_eval.eval(model) - logger.info(f'{config.eval.name} acc : {acc}') - else: - for ppl_eval in eval_list: - ppl = ppl_eval.eval(model) - logger.info(f'{ppl_eval.dataset} ppl : {ppl}') - - if 'eval_token_consist' in config.eval and config.eval.eval_token_consist: - org_model = MODEL_REGISTRY[config.model.type]( - config.model.path, config.model.torch_dtype - ) - token_consist_eval = TokenConsistencyEval(tokenizer.get_tokenizer(), - eval_config) - consistency_ratio = token_consist_eval.eval(model, org_model) - logger.info(f'Token consistency ratio: {consistency_ratio}') - del org_model + for ppl_eval in eval_list: + ppl = ppl_eval.eval(model) + logger.info(f'{ppl_eval.dataset} ppl : {ppl}') if 'save' in config and config.save.get('save_fake', False): blockwise_opt.deploy('fake_quant') blockwise_opt.save_model(save_fake_path) - if 'save' in config and config.save.get('save_vllm', False): - w, a = config.quant.weight, config.quant.get('act') - if isinstance(w.bit, str): - assert a, 'Only WA float quant is supported.' - assert w.symmetric and a.symmetric, 'Only symmetric quant is supported.' - assert w.bit == a.bit and w.bit in ['e4m3', 'e5m2'] and \ - a.bit in ['e4m3', 'e5m2'], 'Only WA FP8 quant is supported' - else: - assert w.symmetric, 'Only symmetric quant is supported.' - assert w.bit in [4, 8], 'Supported quant: w4a16, w8a16, w8a8.' - if a: - assert a.symmetric, 'Only symmetric quant is supported.' - assert a.bit == 8, 'Supported quant: w4a16, w8a16, w8a8.' - blockwise_opt.deploy('vllm_quant') - blockwise_opt.save_model(save_quant_path) - update_vllm_quant_config(blockwise_opt.model, config, save_quant_path) - - if 'save' in config and config.save.get('save_sgl', False): - w, a = config.quant.weight, config.quant.get('act') - if isinstance(w.bit, str): - assert a, 'Only WA float quant is supported.' - assert w.symmetric and a.symmetric, 'Only symmetric quant is supported.' - assert w.bit == a.bit and w.bit in ['e4m3', 'e5m2'] and \ - a.bit in ['e4m3', 'e5m2'], 'Only WA FP8 quant is supported' - else: - assert w.symmetric, 'Only symmetric quant is supported.' - assert w.bit in [4, 8], 'Supported quant: w4a16, w8a16, w8a8.' - if a: - assert a.symmetric, 'Only symmetric quant is supported.' - assert a.bit == 8, 'Supported quant: w4a16, w8a16, w8a8.' - blockwise_opt.deploy('sgl_quant') - blockwise_opt.save_model(save_quant_path) - update_vllm_quant_config(blockwise_opt.model, config, save_quant_path) - - if 'save' in config and config.save.get('save_autoawq', False): - assert config.quant.weight.bit in [4] and 'act' not in config.quant, \ - 'AutoAWQ supports only 4-bit weight-only quantization.' - assert not config.quant.weight.symmetric, 'Only asymmetric quant is supported.' - - blockwise_opt.deploy('autoawq_quant') - blockwise_opt.save_model(save_quant_path) - update_autoawq_quant_config(config, save_quant_path) - - if 'save' in config and config.save.get('save_mlcllm', False): - assert config.quant.weight.bit in [4] and 'act' not in config.quant, \ - 'MlcLLM supports only 4-bit weight-only quantization.' - assert not config.quant.weight.symmetric, 'Only asymmetric quant is supported.' 
- - blockwise_opt.deploy('mlcllm_quant') + if 'save' in config and config.save.get('save_lightllm', False): + blockwise_opt.deploy('real_quant') blockwise_opt.save_model(save_quant_path) - update_autoawq_quant_config(config, save_quant_path) - - if 'opencompass' in config: - assert config.save.get('save_trans', False) - cfg_path = config['opencompass']['cfg_path'] - output_path = config['opencompass']['output_path'] - eval_model_path = os.path.abspath(save_trans_path) - opencompass_cmd = ( - f'opencompass {cfg_path} -w {output_path} ' - f'--llmc_cfg {args.config} ' - f'--llmc_eval_mode quant ' - f'--llmc_model_path {eval_model_path}' - ) - logger.info(f'opencompass_cmd : {opencompass_cmd}') - os.system(opencompass_cmd) if __name__ == '__main__': llmc_start_time = time.time() parser = argparse.ArgumentParser() parser.add_argument('--config', type=str, required=True) - parser.add_argument('--task_id', type=str, required=True) args = parser.parse_args() with open(args.config, 'r') as file: config = yaml.safe_load(file) config = EasyDict(config) - init_process_group(backend='nccl') - torch.cuda.set_device(int(os.environ['LOCAL_RANK'])) - - if int(os.environ['RANK']) != 0: - logger.remove() - check_config(config) logger.info(f'args: {args}') logger.info(f'config:\n{json.dumps(config, ensure_ascii=False, indent=4)}') - print_important_package_version() - - logger.info(f'WORLD_SIZE : {int(os.environ["WORLD_SIZE"])}') - - seed_all(config.base.seed + int(os.environ['RANK'])) + seed_all(config.base.seed) # mkdirs if 'save' in config: @@ -251,17 +196,8 @@ def main(config): config.save.save_path, 'trtllm_engine' ) mkdirs(save_trtllm_engine_path) - if config.save.get('save_vllm', False): - save_quant_path = os.path.join(config.save.save_path, 'vllm_quant_model') - mkdirs(save_quant_path) - if config.save.get('save_sgl', False): - save_quant_path = os.path.join(config.save.save_path, 'sgl_quant_model') - mkdirs(save_quant_path) - if config.save.get('save_autoawq', False): - save_quant_path = os.path.join(config.save.save_path, 'autoawq_quant_model') - mkdirs(save_quant_path) - if config.save.get('save_mlcllm', False): - save_quant_path = os.path.join(config.save.save_path, 'mlcllm_quant_model') + if config.save.get('save_lightllm', False): + save_quant_path = os.path.join(config.save.save_path, 'real_quant_model') mkdirs(save_quant_path) if config.save.get('save_fake', False): save_fake_path = os.path.join(config.save.save_path, 'fake_quant_model') @@ -269,8 +205,6 @@ def main(config): main(config) - destroy_process_group() - llmc_end_time = time.time() llmc_duration_time = llmc_end_time - llmc_start_time logger.info(f'llmc_duration_time: {llmc_duration_time} s') diff --git a/llmc/compression/blockwise_optimization.py b/llmc/compression/blockwise_optimization.py index 0dc7ccbfa..d8d844c7c 100644 --- a/llmc/compression/blockwise_optimization.py +++ b/llmc/compression/blockwise_optimization.py @@ -5,13 +5,12 @@ class BlockwiseOpt(metaclass=ABCMeta): - def __init__(self, model, quant_config, input, padding_mask, config): + def __init__(self, model, quant_config, input, config): self.model = model self.blocks = model.get_blocks() self.quant_config = quant_config self.sparsity_config = quant_config self.input = input - self.padding_mask = padding_mask self.data_free = False if self.input else True self.config = config self.block_idx = None @@ -20,9 +19,6 @@ def __init__(self, model, quant_config, input, padding_mask, config): for i in range(len(input['kwargs'])): if 'use_cache' in input['kwargs'][i]: 
input['kwargs'][i].pop('use_cache') - for i in range(len(input['kwargs'])): - if 'past_key_value' in input['kwargs'][i]: - input['kwargs'][i]['past_key_value'] = None self.n_samples = 0 for i in range(len(input['data'])): self.n_samples += input['data'][i].shape[0] diff --git a/llmc/compression/quantization/__init__.py b/llmc/compression/quantization/__init__.py index a57973ace..356d51d80 100644 --- a/llmc/compression/quantization/__init__.py +++ b/llmc/compression/quantization/__init__.py @@ -9,9 +9,11 @@ from .ntweak import NormTweaking from .omniq import OmniQuant from .osplus import OsPlus -from .quant import FloatQuantizer, IntegerQuantizer +from .quant import Quantizer from .quarot import Quarot from .quik import QUIK from .rtn import RTN from .smoothquant import SmoothQuant +from .spinquant import SpinQuant from .spqr import SpQR +from .train_utils import SGDG diff --git a/llmc/compression/quantization/awq.py b/llmc/compression/quantization/awq.py index 99b28709b..8a2b291e6 100644 --- a/llmc/compression/quantization/awq.py +++ b/llmc/compression/quantization/awq.py @@ -1,8 +1,6 @@ import gc -import os import torch -import torch.distributed as dist import torch.nn as nn from loguru import logger @@ -17,8 +15,8 @@ @ALGO_REGISTRY class Awq(BaseBlockwiseQuantization): - def __init__(self, model, quant_config, input, padding_mask, config): - super().__init__(model, quant_config, input, padding_mask, config) + def __init__(self, model, quant_config, input, config): + super().__init__(model, quant_config, input, config) special_config = self.quant_config.get('special', {}) self.trans = special_config.get('trans', True) self.trans_version = special_config.get('trans_version', 'v2') @@ -40,10 +38,7 @@ def get_weight_scale(self, layers_dict): ) weights = wquantizer.reshape_tensor(weights) scale = weights.abs() / weights.abs().amax(dim=1, keepdim=True) - try: - scale = scale.view(org_shape) - except RuntimeError: - scale = wquantizer.restore_tensor(scale, org_shape) + scale = scale.view(org_shape) scale = scale.mean(0) del weights gc.collect() @@ -126,15 +121,12 @@ def search_scale_subset(self, layers_dict, input, inspect_module, subset_kwargs) self.quantizer_mix_bits, self.aquantizer, ).fake_quant_act_dynamic(x_tmp) + out = inspect_module(x_tmp, **kwargs) if isinstance(out, tuple): out = out[0] - if self.padding_mask: - org_out = org_out * self.padding_mask[i].unsqueeze(dim=-1).to(org_out.device) # noqa - out = out * self.padding_mask[i].unsqueeze(dim=-1).to(out.device) - loss = (org_out - out).float().pow(2).mean().item() loss_mean += x.shape[0] * 1.0 / self.n_samples * loss scales_mean += x.shape[0] * 1.0 / self.n_samples * scales @@ -144,8 +136,6 @@ def search_scale_subset(self, layers_dict, input, inspect_module, subset_kwargs) best_error = loss_mean best_scales = scales_mean best_scales = best_scales.view(-1) - dist.all_reduce(best_scales, op=dist.ReduceOp.SUM) - best_scales /= int(os.environ['WORLD_SIZE']) del org_out_dict gc.collect() torch.cuda.empty_cache() @@ -166,11 +156,7 @@ def block_transform(self, block, input_feat, block_kwargs): if self.weight_clip: logger.info('auto_clip start') logger.info(f'clip version: {self.clip_version}') - self.auto_clip( - block, - input_feat, - n_sample_token=self.config.calib.get('seq_len', None) - ) + self.auto_clip(block, input_feat, n_sample_token=self.config.calib.seq_len) logger.info('auto_clip finished') else: logger.info('disable weight clip') diff --git a/llmc/compression/quantization/base_blockwise_quantization.py 
b/llmc/compression/quantization/base_blockwise_quantization.py index 09ad2b66a..242883491 100644 --- a/llmc/compression/quantization/base_blockwise_quantization.py +++ b/llmc/compression/quantization/base_blockwise_quantization.py @@ -1,12 +1,10 @@ import functools import gc import json -import os from collections import defaultdict from functools import partial import torch -import torch.distributed as dist import torch.nn as nn from loguru import logger @@ -15,16 +13,18 @@ from ..blockwise_optimization import BlockwiseOpt from .hadamard_utils import apply_exact_had_to_linear, get_hadK from .module_utils import (_LLMC_LINEAR_TYPES_, _LLMC_LN_TYPES_, - _REALQUANT_LINEAR_MAP_, _TRANSFORMERS_LINEAR_TYPES_, + _TRANSFORMERS_LINEAR_TYPES_, _TRANSFORMERS_LN_TYPES_, EffcientFakeQuantLinear, - FakeQuantLinear, OriginFloatLinear, RotateLinear) -from .quant import FloatQuantizer, IntegerQuantizer + FakeQuantLinear, OriginFloatLinear, RealQuantLinear, + RotateLinear) +from .quant import Quantizer +from .rotate_utils import ActRotater, WeightRotater from .utils import check_do_quant, check_w_only, get_aquantizer, get_wquantizer class BaseBlockwiseQuantization(BlockwiseOpt): - def __init__(self, model, quant_config, input, padding_mask, config): - super().__init__(model, quant_config, input, padding_mask, config) + def __init__(self, model, quant_config, input, config): + super().__init__(model, quant_config, input, config) self.set_quant_config() def w_qdq(self, module, wquantizer): @@ -45,7 +45,14 @@ def a_qdq(self, act, module, aquantizer): def logit(self, x): return torch.log(x / (1 - x)) - def get_replacement_params(self, mode='fake_quant', w_only=False, name=None): + def random_orthogonal_matrix(self, hidden_size, dev): + torch.cuda.empty_cache() + random_matrix = torch.randn(size, size, dtype=torch.float64).to(device) + q, r = torch.linalg.qr(random_matrix) + q *= torch.sign(torch.diag(r)).unsqueeze(0) + return q + + def get_replacement_params(self, mode='fake_quant', w_only=False, name=None, args={}): params_dict = {} if mode == 'fake_quant': if not self.mix_bits: @@ -65,37 +72,47 @@ def get_replacement_params(self, mode='fake_quant', w_only=False, name=None): params_dict['aquantizer_default'] = self.aquantizer params_dict['w_only_default'] = w_only - elif mode in _REALQUANT_LINEAR_MAP_.keys(): + elif mode == 'real_quant': params_dict['w_q'] = partial(self.w_q, wquantizer=self.wquantizer) params_dict['quant_config'] = self.quant_config - elif mode == 'online_rotate': + elif mode == 'rotate': + params_dict['w_rot'], params_dict['a_rot'] = None, None + if hasattr(self, 'weight_rotate') and self.weight_rotate: + params_dict['w_rot'] = partial(self.w_rot, w_rotater=self.w_rotater, args=args) + + if hasattr(self, 'online_rotate') and self.online_rotate: + if hasattr(self, 'weight_rotate') and self.weight_rotate: + if name is None or not 'down_proj' in name: + return params_dict + else: + if name is None or not ('down_proj' in name): + return params_dict + + had_K, K = get_hadK( + self.intermediate_size if 'down_proj' in name else self.num_heads + ) + a_rotater = ActRotater( + online_full_had=True if 'down_proj' in name else False, + online_partial_had=True if 'o_proj' in name else False, + fp32_had=self.fp32_had, + K=K, + had_K=had_K, + had_dim=None if 'down_proj' in name else self.hidden_size // self.num_heads, + ) + params_dict['a_rot'] = partial(self.a_rot, a_rotater=a_rotater) - had_K, K = get_hadK( - self.intermediate_size if 'down_proj' in name else self.num_heads - ) - params_dict = { - 
'had_K': had_K, - 'K': K, - 'online_full_had': 'down_proj' in name, - 'online_partial_had': 'o_proj' in name, - 'had_dim': ( - None if 'down_proj' in name else self.hidden_size // self.num_heads - ), - 'fp32_had': self.fp32_had, - } return params_dict def alloc_bits(self, mix_bits_settings): - for i in range(len(mix_bits_settings)): mix_bits_setting = mix_bits_settings[f'setting_{i}'] if mix_bits_setting['do_quant']: - wquantizer_mix_bits = self.quant_module(**mix_bits_setting['weight']) + wquantizer_mix_bits = Quantizer(**mix_bits_setting['weight']) if 'act' in mix_bits_setting: w_only_mix_bits = False - aquantizer_mix_bits = self.quant_module(**mix_bits_setting['act']) + aquantizer_mix_bits = Quantizer(**mix_bits_setting['act']) else: w_only_mix_bits = True self.quantizer_mix_bits.append( @@ -145,25 +162,14 @@ def set_quant_config(self): self.quantizer_mix_bits = [] self.quant_out = self.quant_config.get('quant_out', False) - self.tp = self.quant_config.get('tp', 1) - self.quant_config['weight']['tp'] = self.tp - - # select quant module - self.quant_type = self.quant_config.get('quant_type', 'int_quant') - if self.quant_type == 'int_quant': - self.quant_module = IntegerQuantizer - else: - self.quant_module = FloatQuantizer - logger.info(f'The used Quant Module is {self.quant_module}') # set weight quant config - self.wquantizer = self.quant_module(**self.quant_config['weight']) + self.wquantizer = Quantizer(**self.quant_config['weight']) # set act quant config if 'act' in self.quant_config: self.w_only = False - self.quant_config['act']['tp'] = self.tp - self.aquantizer = self.quant_module(**self.quant_config['act']) + self.aquantizer = Quantizer(**self.quant_config['act']) else: self.w_only = True self.aquantizer = None @@ -210,28 +216,11 @@ def set_quant_config(self): assert self.config['model']['type'] in ['Opt', 'Llama'] self.hidden_size = self.model.model_config.hidden_size - if self.online_rotate: - self.num_heads = self.model.model_config.num_attention_heads - self.head_dim = self.hidden_size // self.num_heads - self.intermediate_size = self.model.model_config.intermediate_size - self.fp32_had = special_config.get('fp32_had', False) - - def replace_rotate_linears(self, block): - for n, m in block.named_modules(): - if isinstance(m, nn.Linear) and ('down_proj' in n - or 'o_proj' in n - or 'fc2' in n - or 'out_proj' in n): - subset = {'layers': {n: m}} - self.model.replace_module_subset( - RotateLinear, - block, - subset, - None, - self.get_replacement_params( - mode='online_rotate', w_only=self.w_only, name=n - ), - ) + # if self.online_rotate: + self.num_heads = self.model.model_config.num_attention_heads + self.head_dim = self.hidden_size // self.num_heads + self.intermediate_size = self.model.model_config.intermediate_size + self.fp32_had = special_config.get('fp32_had', False) def block_forward(self, block, input_data=None): output = [] @@ -256,19 +245,15 @@ def block_forward(self, block, input_data=None): def block_opt(self, block): block = block.cuda() named_linears = self.model.get_block_linears(block) - extra_modules = self.model.get_extra_modules(block) - input_feat_modules = { - k: v for d in [named_linears, extra_modules] for k, v in d.items() - } - logger.info(f'input_feat_modules: {input_feat_modules}') + logger.info(f'named_linears: {named_linears}') input_feat = defaultdict(list) handles = [] self.block_init(block) if not self.data_free: - for name in input_feat_modules: + for name in named_linears: handles.append( - input_feat_modules[name].register_forward_hook( + 
named_linears[name].register_forward_hook( functools.partial( self.cache_input_hook, name=name, feat_dict=input_feat ) @@ -334,7 +319,7 @@ def block_init(self, block): def filter_subset(self, subset): return True - def collect_layers_weights(self, layers, tensor_parallelize_style=None): + def collect_layers_weights(self, layers): weights = [] for _m in layers: weights.append(_m.weight) @@ -379,7 +364,7 @@ def apply_shift(self, shifts, prev_op, layers): def scale_fc_fc(self, fc1, fc2, scales): scales = scales.to(fc1.weight.device) if fc1.out_features == fc2.in_features * 3: - num_heads = self.model.get_num_attention_heads() + num_heads = self.model.get_model_config().to_dict().get('n_head', None) fc1.weight.t_() org_shape = fc1.weight.shape fc1.weight.data = fc1.weight.data.reshape(org_shape[0] * num_heads, 3, -1) @@ -461,7 +446,7 @@ def scale_ln_fcs(self, ln, fcs, scales): scales = scales.to(ln.weight.device) ln.weight.div_(scales) - if hasattr(ln, 'bias') and ln.bias is not None: + if self.model.has_bias(): ln.bias.div_(scales) for fc in fcs: @@ -506,12 +491,6 @@ def auto_clip(self, block, input_feat, n_sample_token): n_sample_token=n_sample_token, ) - dist.all_reduce(max_val, op=dist.ReduceOp.SUM) - max_val /= int(os.environ['WORLD_SIZE']) - - dist.all_reduce(min_val, op=dist.ReduceOp.SUM) - min_val /= int(os.environ['WORLD_SIZE']) - self.apply_clip(m, min_val, max_val, n) @torch.no_grad() @@ -519,20 +498,12 @@ def apply_clip(self, layer, min_val, max_val, layer_name): if self.clip_version == 'v1': max_val = max_val.to(layer.weight.device) org_shape = layer.weight.shape - try: - layer.weight.data = layer.weight.data.reshape(*max_val.shape[:2], -1) - except RuntimeError: - layer.weight.data = self.wquantizer.reshape_tensor(layer.weight.data) - layer.weight.data = layer.weight.data.reshape(*max_val.shape[:2], -1) + layer.weight.data = layer.weight.data.reshape(*max_val.shape[:2], -1) if self.clip_sym: min_val = -max_val layer.weight.data = torch.clamp(layer.weight.data, min_val, max_val) - try: - layer.weight.data = layer.weight.data.reshape(org_shape) - except RuntimeError: - layer.weight.data = self.wquantizer \ - .restore_tensor(layer.weight.data, org_shape) + layer.weight.data = layer.weight.data.reshape(org_shape) elif self.clip_version == 'v2': up_factor, low_factor = self.get_clip_factor( layer, min_val, max_val, layer_name @@ -607,11 +578,7 @@ def auto_clip_layer( else: group_size = w.shape[1] - try: - w = w.reshape(w.shape[0], 1, -1, group_size) - except RuntimeError: - w = self.wquantizer.reshape_tensor(w) - w = w.reshape(w.shape[0], 1, -1, group_size) + w = w.reshape(w.shape[0], 1, -1, group_size) oc_batch_size = 256 if w.shape[0] % 256 == 0 else 64 # prevent OOM assert w.shape[0] % oc_batch_size == 0 @@ -620,7 +587,7 @@ def auto_clip_layer( best_min_val_all = [] for i_b in range(w.shape[0] // oc_batch_size): - w = w_all[i_b * oc_batch_size: (i_b + 1) * oc_batch_size] + w = w_all[i_b * oc_batch_size : (i_b + 1) * oc_batch_size] if self.clip_sym: org_max_val = w.abs().amax(dim=-1, keepdim=True) @@ -648,17 +615,8 @@ def auto_clip_layer( input[i] = input[i].to(w.device) x = input[i] x = x.view(-1, x.shape[-1]) - if self.padding_mask: - mask_tmp = self.padding_mask[i].flatten() - x = x[mask_tmp.bool()] - try: - x = x.reshape(1, x.shape[0], -1, group_size) - except RuntimeError: - x = self.wquantizer.reshape_tensor(x) - x = x.reshape(1, x.shape[0], -1, group_size) - if n_sample_token is None: - n_sample_token = min(x.shape[1], 512) - x = x[:, 0:: x.shape[1] // n_sample_token] + x = 
x.reshape(1, x.shape[0], -1, group_size) + x = x[:, 0 :: x.shape[1] // n_sample_token] if i in org_out_dict: org_out = org_out_dict[i] else: @@ -682,14 +640,14 @@ def auto_clip_layer( w, low_factor, up_factor ) - scales, zeros, qmax, qmin = wquantizer.get_qparams( + scales, zeros, max_int, min_int = wquantizer.get_qparams( tensor_range, w.device ) args = {} args['scales'] = scales args['zeros'] = zeros - args['qmax'] = qmax - args['qmin'] = qmin + args['max_int'] = max_int + args['min_int'] = min_int q_w = wquantizer.fake_quant_weight_static(w, args) else: raise Exception('Not support other clip version') @@ -740,41 +698,87 @@ def auto_clip_layer( torch.cuda.empty_cache() return best_max_val.squeeze(1), best_min_val.squeeze(1) + def replace_rotate_fc(self, block, n, m, Q1=None, Q2=None, transpose=False): + args = {} + if hasattr(self, 'weight_rotate') and self.weight_rotate: + args['Q1'] = Q1 + args['Q2'] = Q2 + args['transpose'] = transpose + + params_dict = self.get_replacement_params(mode='rotate', w_only=self.w_only, name=n, args=args) + if params_dict == {}: + return + + subset = {'layers': {n: m}} + self.model.replace_module_subset( + RotateLinear, + block, + subset, + self.block_idx, + params_dict + ) + + def replace_rotate_fcs(self, block): + for n, m in block.named_modules(): + if isinstance(m, nn.Linear): + self.replace_rotate_fc(block, n, m) + + def rotate_weight(self, weight, bias, Q, transpose): + dtype = weight.dtype + dev = weight.data.device + R_b = bias + + W = weight.data.to(device=dev, dtype=torch.float64) + Q = Q.to(device=dev, dtype=torch.float64) + if not transpose: + R_W = torch.matmul(W, Q).to(device='cpu', dtype=dtype) + else: + R_W = torch.matmul(Q.T, W).to(device='cpu', dtype=dtype) + if bias is not None: + b = bias.data.to(device=dev, dtype=torch.float64) + R_b = torch.matmul(Q.T, b).to(device='cpu', dtype=dtype) + + return R_W, R_b + def rotate_pre_layers(self, pre_layers, Q): + transpose = False for layer in pre_layers: - dtype = layer.weight.dtype - device = layer.weight.data.device - W = layer.weight.data.to(device=device, dtype=torch.float64) - layer.weight.data = torch.matmul(W, Q).to(device='cpu', dtype=dtype) + layer.weight.data, _ = self.rotate_weight(layer.weight, None, Q, transpose) def rotate_post_layers(self, post_layers, Q, exact_had=False): + transpose = True for layer in post_layers: - dtype = layer.weight.dtype - device = layer.weight.data.device - W = layer.weight.data.to(device=device, dtype=torch.float64) - layer.weight.data = torch.matmul(Q.T, W).to(device='cpu', dtype=dtype) + weight = layer.weight + if hasattr(layer, 'bias') and layer.bias is not None: + bias = layer.bias + else: + bias = None + R_weight, R_bias = self.rotate_weight( + weight, bias, Q, transpose + ) + layer.weight.data = R_weight + if bias is not None: + layer.bias.data = bias if exact_had and self.online_rotate: apply_exact_had_to_linear(layer, had_dim=-1, output=False) - if hasattr(layer, 'bias') and layer.bias is not None: - b = layer.bias.data.to(device=device, dtype=torch.float64) - layer.bias.data = torch.matmul(Q.T, b).to(device='cpu', dtype=dtype) - def rotate_embeddings(self, Q): + transpose = False embeddings = self.model.get_embed_layers() assert len(embeddings) == 1 for layer in embeddings: - dtype = layer.weight.data.dtype - W = layer.weight.data.to(device=self.dev, dtype=torch.float64) - layer.weight.data = torch.matmul(W, Q).to(device='cpu', dtype=dtype) + layer.weight.data, _ = self.rotate_weight( + layer.weight, None, Q, transpose + ) def rotate_head(self, 
Q): + transpose = False heads = self.model.get_head_layers() for layer in heads: - dtype = layer.weight.data.dtype - W = layer.weight.data.to(device=self.dev, dtype=torch.float64) - layer.weight.data = torch.matmul(W, Q).to(device='cpu', dtype=dtype) + layer.weight.data, _ = self.rotate_weight( + layer.weight, None, Q, transpose + ) def fuse_ln_fcs(self, ln, fcs): for fc in fcs: @@ -786,8 +790,7 @@ def fuse_ln_fcs(self, ln, fcs): fc.bias = torch.nn.Parameter( torch.zeros(fc.out_features, dtype=torch.float64) ) - fc.bias.data = fc.bias.data.double().to(device=W.device) \ - + torch.matmul(W, ln.bias.double()) + fc.bias.data = fc.bias.data.double() + torch.matmul(W, ln.bias.double()) fc.bias.data = fc.bias.data.to(fc_dtype) def remove_mean_from_embed(self): @@ -809,15 +812,15 @@ def bake_mean_into_fc(self, fc): fc.bias.data = fc.bias.data.to(fc_dtype) @torch.no_grad() - def deploy(self, quant_format, keep_device=False): + def deploy(self, quant_format): logger.info(f'-- deploy_{quant_format}_model start --') logger.info(f'quant_config : {self.quant_config}') module_mapping = { + 'fake_quant': EffcientFakeQuantLinear, + 'real_quant': RealQuantLinear, 'origin_float': OriginFloatLinear, - 'fake_quant': EffcientFakeQuantLinear } - module_mapping.update(_REALQUANT_LINEAR_MAP_) if quant_format not in module_mapping: raise NotImplementedError( @@ -826,17 +829,14 @@ def deploy(self, quant_format, keep_device=False): module = module_mapping[quant_format] self.model.replace_module_all( - module, - self.get_replacement_params(mode=quant_format, w_only=self.w_only), - keep_device=keep_device + module, self.get_replacement_params(mode=quant_format, w_only=self.w_only) ) logger.info(f'-- deploy_{quant_format}_model done --') @torch.no_grad() def copy_tokenizer(self, path): - for substring in self.config.save.get('tokenizer_file_substring', - ['token', 'merges', 'vocab']): + for substring in self.config.save.get('tokenizer_file_substring', ['token']): copy_files(self.config.model.path, path, substring) logger.info('copy tokenizer done --') @@ -852,18 +852,11 @@ def contiguous_params(self): @torch.no_grad() def save_model(self, path): - if int(os.environ['RANK']) != 0: - return - self.contiguous_params() - if self.config.model.type in ['Llava', 'InternVL2']: - self.model.vlm_model.language_model = self.model.get_model() - self.model.vlm_model.save_pretrained(path) - logger.info('save model done --') - self.copy_tokenizer(path) - copy_files(self.config.model.path, path, 'preprocessor_config') - elif self.config.model.type in ['InternOmni']: - self.model.avlm_model.language_model = self.model.get_model() - self.model.avlm_model.save_pretrained(path) + if self.online_rotate: + self.contiguous_params() + if self.config.model.type == 'Llava': + self.model.llava_model.language_model = self.model.get_model() + self.model.llava_model.save_pretrained(path) logger.info('save model done --') self.copy_tokenizer(path) copy_files(self.config.model.path, path, 'preprocessor_config') diff --git a/llmc/compression/quantization/dgq.py b/llmc/compression/quantization/dgq.py index 4109065d5..823ba4862 100644 --- a/llmc/compression/quantization/dgq.py +++ b/llmc/compression/quantization/dgq.py @@ -8,13 +8,13 @@ from .base_blockwise_quantization import BaseBlockwiseQuantization from .module_utils import _LLMC_LN_TYPES_, _TRANSFORMERS_LN_TYPES_ -from .quant import IntegerQuantizer +from .quant import Quantizer @ALGO_REGISTRY class DGQ(BaseBlockwiseQuantization): - def __init__(self, model, quant_config, input, padding_mask, 
config): - super().__init__(model, quant_config, input, padding_mask, config) + def __init__(self, model, quant_config, input, config): + super().__init__(model, quant_config, input, config) self.model_dtype = next(self.model.model.parameters()).dtype def w_qdq(self, module, wquantizer): @@ -28,8 +28,8 @@ def w_qdq(self, module, wquantizer): args = {} args['scales'] = s.reshape(-1, 1) args['zeros'] = zeros.reshape(-1, 1) - args['qmax'] = upper - args['qmin'] = lower + args['max_int'] = upper + args['min_int'] = lower # logger.info(f"s.shape : {s.shape}") # logger.info(f"scales.shape : {scales.shape}") # logger.info(f"zeros.shape : {zeros.shape}") @@ -43,22 +43,21 @@ def set_quant_config(self): self.quant_out = True else: self.quant_out = False - self.quant_type = self.quant_config.get('quant_type', 'int_quant') - assert self.quant_type != 'float_quant', 'DGQ do not support Float quant now.' + # set weight quant config - self.wquantizer_w4 = IntegerQuantizer(**self.quant_config['weight']['w_1']) + self.wquantizer_w4 = Quantizer(**self.quant_config['weight']['w_1']) perchannel_setting = { 'bit': self.quant_config['weight']['w_1']['bit'], 'symmetric': self.quant_config['weight']['w_1']['symmetric'], 'granularity': 'per_channel', } - self.wquantizer_w4_perchannel = IntegerQuantizer(**perchannel_setting) - self.wquantizer_w8 = IntegerQuantizer(**self.quant_config['weight']['w_2']) + self.wquantizer_w4_perchannel = Quantizer(**perchannel_setting) + self.wquantizer_w8 = Quantizer(**self.quant_config['weight']['w_2']) # set act quant config if 'act' in self.quant_config and self.quant_config['act'] is not None: self.w_only = False - self.aquantizer = IntegerQuantizer(**self.quant_config['act']) + self.aquantizer = Quantizer(**self.quant_config['act']) else: self.w_only = True @@ -191,12 +190,12 @@ def search_scale_zero_layer(self, layer, input_feat): _, scales, zeros, - qmax, - qmin, + max_int, + min_int, ) = self.wquantizer_w4_perchannel.get_tensor_qparams(weight_OxG) # Perchannel do not need reshape and restore tensor. 
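            # get_tensor_qparams above yields one scale/zero per output row of
            # weight_OxG, so quant_dequant below can consume the tensor as-is,
            # with the per-channel parameters broadcasting over the group dim.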
weight_OxG_fq = self.wquantizer_w4_perchannel.quant_dequant( - weight_OxG, scales, zeros, qmax, qmin + weight_OxG, scales, zeros, max_int, min_int ) if not self.w_only: inp_LxG_fq = self.a_qdq(inp_LxG) @@ -225,8 +224,8 @@ def search_scale_zero_layer(self, layer, input_feat): _, qscales_8, zeros, - qmax, - qmin, + max_int, + min_int, ) = self.wquantizer_w8.get_tensor_qparams( weight_tmp.clamp(-w_max * ratio, w_max * ratio) ) diff --git a/llmc/compression/quantization/gptq.py b/llmc/compression/quantization/gptq.py index c8b0a9aea..f2f4319bd 100644 --- a/llmc/compression/quantization/gptq.py +++ b/llmc/compression/quantization/gptq.py @@ -17,8 +17,8 @@ @ALGO_REGISTRY class GPTQ(BaseBlockwiseQuantization): - def __init__(self, model, quant_config, input, padding_mask, config): - super().__init__(model, quant_config, input, padding_mask, config) + def __init__(self, model, quant_config, input, config): + super().__init__(model, quant_config, input, config) self.dev = torch.device('cuda') self.model_dtype = next(self.model.model.parameters()).dtype self.add_quant_config() @@ -64,7 +64,7 @@ def hessian_sorting(self, name): if self.actorder: perm = torch.cat( - [descending_ids[self.n_out:], descending_ids[:self.self.n_out]] + [descending_ids[self.n_out:], descending_ids[: self.self.n_out]] ) else: perm = torch.cat( @@ -98,20 +98,19 @@ def block_transform_true_sequential(self, block, input_feat): torch.cuda.empty_cache() self.subset_transform(subset['layers']) - if self.quant_out: - self.model.replace_module_subset( - FakeQuantLinear, - block, - subset, - self.block_idx, - self.get_replacement_params('fake_quant', w_only=True), - ) + self.model.replace_module_subset( + FakeQuantLinear, + block, + subset, + self.block_idx, + self.get_replacement_params('fake_quant', w_only=True), + ) @torch.no_grad() def block_transform(self, block, input_feat, block_kwargs): logger.info(f'Start transform the {self.block_idx+1}-th block') if self.online_rotate: - self.replace_rotate_linears(block) + self.replace_rotate_fcs(block) if self.owq and not hasattr(self, 'n_out_dict'): named_linears = self.model.get_block_linears(block) self.n_out_dict = {} @@ -149,7 +148,7 @@ def initialize_qparams_and_prepare_weights(self, layer, name): self.qparams = {} self.columns = self.layers_cache[name]['columns'] self.n_out = self.n_out_dict[name] if self.owq else 0 - self.n_nonout = self.columns - self.n_out + self.n_nonout = layer.weight.data.shape[1] - self.n_out if self.actorder or self.owq: self.hessian_sorting(name) @@ -238,18 +237,14 @@ def weight_transform(self, W, Hinv, Losses, tmp): for i in range(count): w, d = W1[:, i], Hinv1[i, i] + idx = i1 + i + if self.wquantizer.granularity == 'per_group': - idx = i1 + i - if not self.static_groups: - if (i1 + i) % self.wquantizer.group_size == 0: - column_tensors = W[ - :, - (i1 + i):min( - (i1 + i + self.wquantizer.group_size), - (self.columns - self.n_out), - ), - ] - self.search_column_qparams(column_tensors, idx) + if not self.static_groups and idx % self.wquantizer.group_size == 0: + col_end = min( + idx + self.wquantizer.group_size, self.columns - self.n_out + ) + self.search_column_qparams(W[:, idx:col_end], idx) else: if self.actorder: idx = self.perm[idx] @@ -259,8 +254,8 @@ def weight_transform(self, W, Hinv, Losses, tmp): w.unsqueeze(1), self.qparams['scale'], self.qparams['zero'], - self.qparams['qmax'], - self.qparams['qmin'], + self.qparams['max_int'], + self.qparams['min_int'], ).squeeze(1) tmp1[:, i] = w @@ -286,7 +281,7 @@ def add_batch(self, layer, name, inp, 
out): ): if isinstance(layer, RotateLinear): # online rotate - inp = layer.rotater.rotate(inp) + inp = layer.a_rotater.rotate(inp) if len(inp.shape) == 3: inp = inp.reshape((-1, inp.shape[-1])) inp = inp.t() @@ -346,21 +341,21 @@ def collect_model_qparams(self): tensor, scales, zeros, - qmax, - qmin, + max_int, + min_int, ) = self.wquantizer.get_tensor_qparams(m.weight.data) m = m.to(self.model_dtype) m.cpu() m.register_buffer('buf_scales', scales) m.register_buffer('buf_zeros', zeros) - m.register_buffer('buf_qmax', torch.tensor(qmax)) - m.register_buffer('buf_qmin', torch.tensor(qmin)) + m.register_buffer('buf_max_int', torch.tensor(max_int)) + m.register_buffer('buf_min_int', torch.tensor(min_int)) @torch.no_grad() def split_qparams(self, qparams): group_qparams = [] - group_num = math.ceil(self.columns / self.wquantizer.group_size) - qparams = qparams.reshape(math.ceil(qparams.shape[0] / group_num), -1) + group_num = self.columns // self.wquantizer.group_size + qparams = qparams.reshape(qparams.shape[0] // group_num, -1) qparams = qparams.t() group_qparams = list(torch.split(qparams, 1, dim=0)) for i in range(len(group_qparams)): @@ -384,11 +379,11 @@ def merge_qparams(self, qparams): @torch.no_grad() def search_column_qparams(self, c_tensor, idx): - _, scale, zero, qmax, qmin = self.wquantizer.get_tensor_qparams(c_tensor) + _, scale, zero, max_int, min_int = self.wquantizer.get_tensor_qparams(c_tensor) self.qparams['scale'] = scale self.qparams['zero'] = zero - self.qparams['qmax'] = qmax - self.qparams['qmin'] = qmin + self.qparams['max_int'] = max_int + self.qparams['min_int'] = min_int qparams = copy.deepcopy(self.qparams) self.groups[idx // self.wquantizer.group_size] = qparams @@ -397,28 +392,27 @@ def search_layer_qparams(self, layer): scales = layer.buf_scales zeros = layer.buf_zeros scales = self.merge_qparams(scales) - if not self.wquantizer.sym: - zeros = self.merge_qparams(zeros) + zeros = self.merge_qparams(zeros) self.qparams['scale'], self.qparams['zero'] = scales, zeros - self.qparams['qmax'] = layer.buf_qmax - self.qparams['qmin'] = layer.buf_qmin + self.qparams['max_int'] = layer.buf_max_int + self.qparams['min_int'] = layer.buf_min_int @torch.no_grad() def search_group_qparams(self, layer): scales = layer.buf_scales zeros = layer.buf_zeros self.group_scales = self.split_qparams(scales) - if not self.wquantizer.sym: + if zeros is not None: self.group_zeros = self.split_qparams(zeros) for i in range(len(self.group_scales)): qparams = {} qparams['scale'] = self.group_scales[i] - if not self.wquantizer.sym: + if zeros is not None: qparams['zero'] = self.group_zeros[i] else: - qparams['zero'] = torch.tensor(0.0) - qparams['qmax'] = layer.buf_qmax - qparams['qmin'] = layer.buf_qmin + qparams['zero'] = None + qparams['max_int'] = layer.buf_max_int + qparams['min_int'] = layer.buf_min_int self.groups.append(qparams) @torch.no_grad() @@ -429,11 +423,9 @@ def update_model_qparams(self, layer): _scales.append(g['scale']) _zeros.append(g['zero']) scales = self.merge_qparams(_scales) + zeros = self.merge_qparams(_zeros) layer.buf_scales = copy.deepcopy(scales) - - if not self.wquantizer.sym: - zeros = self.merge_qparams(_zeros) - layer.buf_zeros = copy.deepcopy(zeros) + layer.buf_zeros = copy.deepcopy(zeros) @torch.no_grad() def w_q(self, module, wquantizer): @@ -441,8 +433,8 @@ def w_q(self, module, wquantizer): args = {} args['scales'] = module.buf_scales args['zeros'] = module.buf_zeros - args['qmax'] = module.buf_qmax - args['qmin'] = module.buf_qmin + args['max_int'] = 
module.buf_max_int + args['min_int'] = module.buf_min_int args['scales'] = args['scales'].to(self.model_dtype) weight, scales, zeros = wquantizer.real_quant_weight_static(weight, args) @@ -461,8 +453,8 @@ def w_qdq(self, module, wquantizer): args['zeros'] = module.buf_zeros else: args['zeros'] = None - args['qmax'] = module.buf_qmax - args['qmin'] = module.buf_qmin + args['max_int'] = module.buf_max_int + args['min_int'] = module.buf_min_int if self.owq: fp_weight = weight[:, module.buf_n_nonout:] @@ -480,7 +472,7 @@ def w_qdq(self, module, wquantizer): @torch.no_grad() def deploy(self, quant_format): - if quant_format not in ['fake_quant', 'origin_float']: + if quant_format == 'real_quant': assert not self.need_perm super().deploy(quant_format) self.model.convert_dtype(self.model_dtype) diff --git a/llmc/compression/quantization/hadamard_utils.py b/llmc/compression/quantization/hadamard_utils.py index 2a5f4b144..81c1e15d5 100644 --- a/llmc/compression/quantization/hadamard_utils.py +++ b/llmc/compression/quantization/hadamard_utils.py @@ -11,11 +11,23 @@ 'If you need it, please install it firstly.' ) -# from .module_utils import RotateLinear # Adapted from # https://github.com/Cornell-RelaxML/quip-sharp/blob/main/lib/utils/matmul_had.py +class HadamardTransform(torch.autograd.Function): + """The unnormalized Hadamard transform (i.e. without dividing by + sqrt(2))""" + + @staticmethod + def forward(ctx, u): + return fast_hadamard_transform.hadamard_transform(u) + + @staticmethod + def backward(ctx, grad): + return fast_hadamard_transform.hadamard_transform(grad) + + def get_hadK(n, transpose=False): hadK, K = None, None if n % 172 == 0: # llama-2-7b up @@ -109,25 +121,68 @@ def random_hadamard_matrix(size, device): def matmul_hadU_cuda(X, hadK, K): n = X.shape[-1] if K == 1: - return fast_hadamard_transform.hadamard_transform( - X.contiguous(), 1.0 / torch.tensor(n).sqrt() - ) + return HadamardTransform.apply(X.contiguous()) / torch.tensor(n).sqrt() # if transpose: # hadK = hadK.T.contiguous() input = X.view(-1, K, n // K) - input = fast_hadamard_transform.hadamard_transform( - input.contiguous(), 1.0 / torch.tensor(n).sqrt() - ) + input = HadamardTransform.apply(input.contiguous()) / torch.tensor(n).sqrt() input = hadK.to(input.device).to(input.dtype) @ input return input.reshape(X.shape) - def matmul_hadUt_cuda(X, hadK, K): return matmul_hadU_cuda(X, hadK, K, transpose=True) -def apply_exact_had_to_linear(module, had_dim=-1, output=False): - # assert isinstance(module, (torch.nn.Linear, RotateLinear)) +# def apply_exact_had_to_linear(module, had_dim=-1, output=False, R2=None): +# # assert isinstance(module, (torch.nn.Linear, RotateLinear)) +# in_features, out_features = module.in_features, module.out_features + +# if had_dim != -1: +# assert is_pow2(had_dim), 'Hadamard dimension must be a power of 2!' 
+ +# W_ = module.weight.data +# dtype = W_.dtype +# dev = W_.device +# # init_shape = W_.shape +# W_ = W_.float().cuda() + +# if had_dim == -1: +# if output: +# had_K, K = get_hadK(out_features) +# W_ = matmul_hadU_cuda(W_.t(), had_K, K).t() +# if not output: +# had_K, K = get_hadK(in_features) +# W_ = matmul_hadU_cuda(W_, had_K, K) +# else: +# # Apply Hadamard to the last had_dim chunks of the weights +# if output: +# W_ = W_.t() +# transposed_shape = W_.shape +# W_ = ( +# fast_hadamard_transform.hadamard_transform( +# W_.reshape(-1, transposed_shape[-1] // had_dim, had_dim), +# scale=1 / math.sqrt(had_dim), +# ) +# .reshape(transposed_shape) +# .t() +# ) +# else: +# raise NotImplementedError('Not implemented (or tested) yet!') +# # n = W_.shape[1] +# # W_ = hadamard_transform( +# # W_.reshape(-1, n // had_dim, had_dim), scale=1 / math.sqrt(had_dim) +# # ).reshape(init_shape) +# module.weight.data = W_.to(device=dev, dtype=dtype) + + +def hadamard_matrix(size, device): + # See https://cornell-relaxml.github.io/quip-sharp/ , Section "Randomized Hadamard Transformation" + Q = torch.eye(size) + return matmul_hadU(Q).to(device) + + +def apply_exact_had_to_linear(module, had_dim=-1, output=False, R2=None): + # assert isinstance(module, torch.nn.Linear) in_features, out_features = module.in_features, module.out_features if had_dim != -1: @@ -136,7 +191,7 @@ def apply_exact_had_to_linear(module, had_dim=-1, output=False): W_ = module.weight.data dtype = W_.dtype dev = W_.device - # init_shape = W_.shape + init_shape = W_.shape W_ = W_.float().cuda() if had_dim == -1: @@ -147,27 +202,22 @@ def apply_exact_had_to_linear(module, had_dim=-1, output=False): had_K, K = get_hadK(in_features) W_ = matmul_hadU_cuda(W_, had_K, K) else: - # Apply Hadamard to the last had_dim chunks of the weights + hadK = hadamard_matrix(had_dim, 'cuda').to(torch.float64) + if R2 is not None: + hadK = R2.to(torch.float64) if output: W_ = W_.t() transposed_shape = W_.shape - W_ = ( - fast_hadamard_transform.hadamard_transform( - W_.reshape(-1, transposed_shape[-1] // had_dim, had_dim), - scale=1 / math.sqrt(had_dim), - ) - .reshape(transposed_shape) - .t() - ) + temp = W_.reshape(-1, transposed_shape[-1] // had_dim, had_dim) + temp = temp.to(torch.float64) @ hadK + W_ = temp.reshape(transposed_shape).t() else: - raise NotImplementedError('Not implemented (or tested) yet!') - # n = W_.shape[1] - # W_ = hadamard_transform( - # W_.reshape(-1, n // had_dim, had_dim), scale=1 / math.sqrt(had_dim) - # ).reshape(init_shape) + init_shape = W_.shape + temp = W_.reshape(-1, init_shape[-1] // had_dim, had_dim) + temp = temp.to(torch.float64) @ hadK + W_ = temp.reshape(init_shape) module.weight.data = W_.to(device=dev, dtype=dtype) - def is_pow2(n): return (n & (n - 1) == 0) and (n > 0) diff --git a/llmc/compression/quantization/hqq.py b/llmc/compression/quantization/hqq.py index 0077c401b..0784dfc47 100644 --- a/llmc/compression/quantization/hqq.py +++ b/llmc/compression/quantization/hqq.py @@ -11,8 +11,8 @@ @ALGO_REGISTRY class HQQ(BaseBlockwiseQuantization): - def __init__(self, model, quant_config, input, padding_mask, config): - super().__init__(model, quant_config, input, padding_mask, config) + def __init__(self, model, quant_config, input, config): + super().__init__(model, quant_config, input, config) self.add_quant_config() @torch.no_grad() @@ -34,13 +34,13 @@ def add_quant_config(self): ) @torch.no_grad() - def optimize_weights_proximal(self, W_f, scales, zeros, qmax, qmin): + def optimize_weights_proximal(self, W_f, scales, 
zeros, max_int, min_int): best_error = 1e4 current_beta = self.beta current_kappa = self.kappa scales = 1 / scales for i in range(self.iters): - W_q = torch.round(W_f * scales + zeros).clamp(qmin, qmax) + W_q = torch.round(W_f * scales + zeros).clamp(min_int, max_int) W_r = (W_q - zeros) / scales W_e = self.shrink_op(W_f - W_r, current_beta) @@ -77,17 +77,17 @@ def block_opt(self, block): tensor, org_scales, org_zeros, - qmax, - qmin, + max_int, + min_int, ) = self.wquantizer.get_tensor_qparams(tensor) best_scales, best_zeros = self.optimize_weights_proximal( - tensor, org_scales, org_zeros, qmax, qmin + tensor, org_scales, org_zeros, max_int, min_int ) layer.register_buffer('buf_scales', best_scales) layer.register_buffer('buf_zeros', best_zeros) - layer.register_buffer('buf_qmax', torch.tensor(qmax)) - layer.register_buffer('buf_qmin', torch.tensor(qmin)) + layer.register_buffer('buf_max_int', torch.tensor(max_int)) + layer.register_buffer('buf_min_int', torch.tensor(min_int)) block = block.cpu() gc.collect() @@ -99,7 +99,7 @@ def w_qdq(self, module, wquantizer): args['dim'] = 'ic' args['scales'] = module.buf_scales args['zeros'] = module.buf_zeros - args['qmax'] = module.buf_qmax - args['qmin'] = module.buf_qmin + args['max_int'] = module.buf_max_int + args['min_int'] = module.buf_min_int return wquantizer.fake_quant_weight_static(module.weight, args) diff --git a/llmc/compression/quantization/llmint8.py b/llmc/compression/quantization/llmint8.py index 29209f63a..18b6fb9a9 100644 --- a/llmc/compression/quantization/llmint8.py +++ b/llmc/compression/quantization/llmint8.py @@ -9,8 +9,8 @@ @ALGO_REGISTRY class LlmInt8(BaseBlockwiseQuantization): - def __init__(self, model, quant_config, input, padding_mask, config): - super().__init__(model, quant_config, input, padding_mask, config) + def __init__(self, model, quant_config, input, config): + super().__init__(model, quant_config, input, config) self.add_quant_config() @torch.no_grad() diff --git a/llmc/compression/quantization/module_utils.py b/llmc/compression/quantization/module_utils.py index 1ce25676a..efe9ff843 100644 --- a/llmc/compression/quantization/module_utils.py +++ b/llmc/compression/quantization/module_utils.py @@ -1,31 +1,24 @@ +import gc import math from functools import partial -import numpy as np import torch import torch.nn as nn +import torch.nn.functional as F from loguru import logger +from transformers.models.llama.modeling_llama import LlamaRMSNorm +from transformers.models.mistral.modeling_mistral import MistralRMSNorm +from transformers.models.mixtral.modeling_mixtral import MixtralRMSNorm +from transformers.models.qwen2.modeling_qwen2 import Qwen2RMSNorm from transformers.pytorch_utils import ALL_LAYERNORM_LAYERS -try: - import fast_hadamard_transform - - from .hadamard_utils import matmul_hadU_cuda -except Exception: - logger.info( - 'fast_hadamard_transform not installed. ' - 'If you need it, please install it firstly.' 
- ) - -from .utils import calculate_zeros_width - class LlmcLayerNorm(nn.Module): def __init__(self, weight, bias, eps, normalized_shape, elementwise_affine): super().__init__() - self.register_buffer('weight', weight) + self.register_buffer("weight", weight) if bias is not None: - self.register_buffer('bias', bias) + self.register_buffer("bias", bias) else: self.bias = None self.eps = eps @@ -62,16 +55,16 @@ def new(cls, module): def __repr__(self): return ( - f'LlmcLayerNorm({self.normalized_shape},' - f'eps={self.eps},' - f'elementwise_affine={self.elementwise_affine})' + f"LlmcLayerNorm({self.normalized_shape}," + f"eps={self.eps}," + f"elementwise_affine={self.elementwise_affine})" ) class LlmcLlamaRMSNorm(nn.Module): def __init__(self, weight, eps=1e-6): super().__init__() - self.register_buffer('weight', weight) + self.register_buffer("weight", weight) self.bias = None self.variance_epsilon = eps self.use_tmp_parameter = False @@ -82,10 +75,10 @@ def forward(self, hidden_states): hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) if self.use_tmp_parameter: weight = self.tmp_weight - bias = self.tmp_bias if hasattr(self, 'tmp_bias') else None + bias = self.tmp_bias if hasattr(self, "tmp_bias") else None else: weight = self.weight - bias = self.bias if hasattr(self, 'bias') else None + bias = self.bias if hasattr(self, "bias") else None return ( (weight * hidden_states + bias).to(input_dtype) @@ -102,7 +95,7 @@ def new(cls, module): return new_module def __repr__(self): - return 'LlmcLlamaRMSNorm()' + return "LlmcLlamaRMSNorm()" class LlmcRMSNorm(nn.Module): @@ -120,16 +113,13 @@ def forward(self, hidden_states): @classmethod @torch.no_grad() def new(cls, module): - if hasattr(module, 'eps'): - eps = module.eps - else: - eps = module.variance_epsilon + eps = module.variance_epsilon weight = module.weight new_module = cls(weight, eps) return new_module def __repr__(self): - return 'LlmcRMSNorm()' + return "LlmcRMSNorm()" class LlmcQwen2RMSNorm(LlmcLlamaRMSNorm): @@ -137,7 +127,7 @@ def __init__(self, weight, eps=1e-6): super().__init__(weight, eps) def __repr__(self): - return 'LlmcQwen2RMSNorm()' + return "LlmcQwen2RMSNorm()" class LlmcMixtralRMSNorm(LlmcLlamaRMSNorm): @@ -145,7 +135,7 @@ def __init__(self, weight, eps=1e-6): super().__init__(weight, eps) def __repr__(self): - return 'LlmcMixtralRMSNorm()' + return "LlmcMixtralRMSNorm()" class LlmcMistralRMSNorm(LlmcLlamaRMSNorm): @@ -153,7 +143,7 @@ def __init__(self, weight, eps=1e-6): super().__init__(weight, eps) def __repr__(self): - return 'LlmcMistralRMSNorm()' + return "LlmcMistralRMSNorm()" class LlmcInternLM2RMSNorm(LlmcLlamaRMSNorm): @@ -161,44 +151,74 @@ def __init__(self, weight, eps=1e-6): super().__init__(weight, eps) def __repr__(self): - return 'LlmcInternLM2RMSNorm()' - - -class LlmcGemma2RMSNorm(LlmcLlamaRMSNorm): - def __init__(self, weight, eps=1e-6): - super().__init__(weight, eps) - - def __repr__(self): - return 'LlmcGemma2RMSNorm()' + return "LlmcInternLM2RMSNorm()" + + +class OriginEmbedding(nn.Module): + def __init__(self, num_embeddings, embedding_dim, padding_idx, + max_norm, norm_type, scale_grad_by_freq, + sparse, weight): + super(OriginEmbedding, self).__init__() + self.num_embeddings = num_embeddings + self.embedding_dim = embedding_dim + self.padding_idx = padding_idx + self.max_norm = max_norm + self.norm_type = norm_type + self.scale_grad_by_freq = scale_grad_by_freq + self.sparse = sparse + self.weight = weight + + def forward(self, input): + return F.embedding( + input, 
self.weight, self.padding_idx, self.max_norm, + self.norm_type, self.scale_grad_by_freq, self.sparse) + + @classmethod + @torch.no_grad() + def new(cls, module): + num_embeddings = module.num_embeddings + embedding_dim = module.embedding_dim + padding_idx = module.padding_idx + max_norm = module.max_norm + norm_type = module.norm_type + scale_grad_by_freq = module.scale_grad_by_freq + sparse = module.sparse + weight = module.weight -class LlmcMiniCPMRMSNorm(LlmcLlamaRMSNorm): - def __init__(self, weight, eps=1e-6): - super().__init__(weight, eps) + new_module = cls(num_embeddings, embedding_dim, padding_idx, + max_norm, norm_type, scale_grad_by_freq, + sparse, weight) + return new_module def __repr__(self): - return 'LlmcMiniCPMRMSNorm()' + return ( + f"OriginEmbedding({self.num_embeddings}, " + f"{self.embedding_dim}, " + f"padding_idx={self.padding_idx})," + ) class OriginFloatLinear(nn.Module): def __init__(self, weight, bias, ori_module): super().__init__() - self.register_buffer('weight', weight) + self.register_buffer("weight", weight) if bias is not None: - self.register_buffer('bias', bias) + self.register_buffer("bias", bias) else: self.bias = None for name, buf in ori_module.named_buffers(): - if name.startswith('buf_'): + if name.startswith("buf_"): self.register_buffer(name, buf.data) - if hasattr(self, 'buf_rotate') and self.buf_rotate: - self.rotater = ori_module.rotater + + if getattr(self, "buf_a_rotate", False): + self.a_rot = ori_module.a_rot @torch.no_grad() def forward(self, x): - if hasattr(self, 'buf_rotate') and self.buf_rotate: - x = self.rotater.rotate(x) + if hasattr(self, "a_rot"): + x = self.a_rot(x, self) x = torch.functional.F.linear(x, self.weight, self.bias) return x @@ -222,98 +242,112 @@ def new(cls, module): def __repr__(self): return ( - f'OriginFloatLinear(in_features={self.in_features},' - f'out_features={self.out_features},' - f'bias={self.bias is not None})' + f"OriginFloatLinear(in_features={self.in_features}, " + f"out_features={self.out_features}, " + f"buf_a_rotate={self.buf_a_rotate}, " + f"bias={self.bias is not None})" ) -class Rotater: - def __init__( - self, online_full_had, online_partial_had, fp32_had, K, had_K=None, had_dim=None - ): - self.online_full_had = online_full_had - self.online_partial_had = online_partial_had - self.fp32_had = fp32_had - self.K = K - self.had_K = had_K - self.had_dim = had_dim - - def rotate(self, x): - x_dtype = x.dtype - - if self.online_full_had: - if self.fp32_had: - x = matmul_hadU_cuda(x.float(), self.had_K, self.K).to(x_dtype) - else: - x = matmul_hadU_cuda(x, self.had_K, self.K) - - elif self.online_partial_had: - if self.fp32_had: - x = x.float() - init_shape = x.shape - if self.K == 1: - x = fast_hadamard_transform.hadamard_transform( - x.reshape( - -1, init_shape[-1] // self.had_dim, self.had_dim - ).transpose(1, 2), - scale=1 / math.sqrt(init_shape[-1] // self.had_dim), - ).transpose(1, 2) - else: - self.had_K = self.had_K.to(x.device) - - x = ( - self.had_K.to(x.dtype) - @ x.reshape(-1, init_shape[-1] // self.had_dim, self.had_dim) - ) / math.sqrt(init_shape[-1] // self.had_dim) - - if self.fp32_had: - x = x.to(x_dtype) - x = x.reshape(init_shape) +class RotateEmbedding(nn.Module): + def __init__(self, num_embeddings, embedding_dim, padding_idx, + max_norm, norm_type, scale_grad_by_freq, + sparse, weight, w_rot): + super(RotateEmbedding, self).__init__() + self.num_embeddings = num_embeddings + self.embedding_dim = embedding_dim + self.padding_idx = padding_idx + self.max_norm = max_norm + 
self.norm_type = norm_type + self.scale_grad_by_freq = scale_grad_by_freq + self.sparse = sparse + self.weight = weight + self.bias = None + self.w_rot = w_rot + + def forward(self, input): + + tmp_weight = self._rotate_weight() + + return F.embedding( + input, tmp_weight, self.padding_idx, self.max_norm, + self.norm_type, self.scale_grad_by_freq, self.sparse) + + def _rotate_weight(self): + if self.w_rot is not None: + tmp_weight, _ = self.w_rot(self) + else: + tmp_weight = self.weight + return tmp_weight + + @classmethod + @torch.no_grad() + def new(cls, module, w_rot): + + num_embeddings = module.num_embeddings + embedding_dim = module.embedding_dim + padding_idx = module.padding_idx + max_norm = module.max_norm + norm_type = module.norm_type + scale_grad_by_freq = module.scale_grad_by_freq + sparse = module.sparse + weight = module.weight - return x + new_module = cls(num_embeddings, embedding_dim, padding_idx, + max_norm, norm_type, scale_grad_by_freq, + sparse, weight, w_rot) + return new_module + + def __repr__(self): + return ( + f"RotateEmbedding({self.num_embeddings}, " + f"{self.embedding_dim}, " + f"w_rotate={self.w_rot is not None}, " + f"padding_idx={self.padding_idx})" + ) class RotateLinear(nn.Module): - def __init__( - self, - weight, - bias, - ori_module, - online_full_had, - online_partial_had, - fp32_had, - K, - had_K, - had_dim, - ): + def __init__(self, weight, bias, ori_module, w_rot, a_rot): super().__init__() - self.register_buffer('weight', weight) + self.register_buffer("weight", weight) if bias is not None: - self.register_buffer('bias', bias) + self.register_buffer("bias", bias) else: self.bias = None for name, buf in ori_module.named_buffers(): - if name.startswith('buf_'): + if name.startswith("buf_"): self.register_buffer(name, buf.data) - self.rotater = Rotater( - online_full_had, online_partial_had, fp32_had, K, had_K, had_dim - ) - self.register_buffer('buf_rotate', torch.tensor(True)) + self.w_rot = w_rot + self.a_rot = a_rot + + self.register_buffer("buf_w_rotate", torch.tensor(w_rot is not None)) + self.register_buffer("buf_a_rotate", torch.tensor(a_rot is not None)) def forward(self, x): - x = self.rotater.rotate(x) - x = torch.functional.F.linear(x, self.weight, self.bias) + if self.buf_a_rotate: + x = self.a_rot(x, self) + + if self.buf_w_rotate: + tmp_weight, tmp_bias = self._rotate_weight() + self.register_buffer("tmp_weight", tmp_weight, persistent=False) + self.register_buffer("tmp_bias", tmp_bias, persistent=False) + + weight = getattr(self, "tmp_weight", self.weight) + bias = getattr(self, "tmp_bias", self.bias) + x = torch.functional.F.linear(x, weight, bias) return x + + def _rotate_weight(self): + tmp_weight, tmp_bias = self.w_rot(self) + return tmp_weight, tmp_bias @classmethod @torch.no_grad() - def new( - cls, module, online_full_had, online_partial_had, fp32_had, K, had_K, had_dim - ): + def new(cls, module, w_rot, a_rot): weight = module.weight.data if module.bias is not None: bias = module.bias.data @@ -324,14 +358,9 @@ def new( weight, bias, ori_module=module, - online_full_had=online_full_had, - online_partial_had=online_partial_had, - fp32_had=fp32_had, - K=K, - had_K=had_K, - had_dim=had_dim, + w_rot=w_rot, + a_rot=a_rot ) - new_module.in_features = module.in_features new_module.out_features = module.out_features return new_module @@ -342,67 +371,72 @@ def get_func_name(cls, any_callable): return any_callable.func.__name__ return any_callable.__name__ - def register_activation_parameters(self, named_parameters): - pass - def 
__repr__(self): return ( - f'RotateLinear(in_features={self.in_features},' - f'out_features={self.out_features},' - f'bias={self.bias is not None},' - f'online_rotate={self.buf_rotate})' + f"RotateLinear(in_features={self.in_features}, " + f"out_features={self.out_features}, " + f"bias={self.bias is not None}, " + f"w_rotate={self.buf_w_rotate}, " + f"a_rotate={self.buf_a_rotate})" ) class FakeQuantLinear(nn.Module): - def __init__(self, weight, bias, ori_module, w_qdq, a_qdq): + def __init__(self, weight, bias, ori_module, w_qdq, a_qdq, w_rot, a_rot): super().__init__() - self.register_buffer('weight', weight) + self.register_buffer("weight", weight) if bias is not None: - self.register_buffer('bias', bias) + self.register_buffer("bias", bias) else: self.bias = None self.a_qdq = a_qdq self.w_qdq = w_qdq for name, buf in ori_module.named_buffers(): - if name.startswith('buf_'): - self.register_buffer(name, buf.data) - for name, buf in ori_module.named_parameters(): - if name.startswith('buf_'): + if name.startswith("buf_"): self.register_buffer(name, buf.data) - if hasattr(self, 'buf_rotate') and self.buf_rotate: - self.rotater = ori_module.rotater - else: - self.buf_rotate = False + if getattr(self, "buf_w_rotate", False): + self.w_rot = w_rot + if getattr(self, "buf_a_rotate", False): + self.a_rot = a_rot self.dynamic_quant_weight = False self.dynamic_quant_tmp_weight = False def forward(self, x): - if hasattr(self, 'buf_rotate') and self.buf_rotate: - x = self.rotater.rotate(x) + if hasattr(self, "a_rot"): + x = self.a_rot(x, self) if self.a_qdq is not None: x = self.a_qdq(x, self) - if not hasattr(self, 'tmp_weight'): - tmp_weight = self.w_qdq(self) - self.register_buffer('tmp_weight', tmp_weight, persistent=False) - self.tmp_bias = self.bias - - elif self.dynamic_quant_weight: + if hasattr(self, "w_rot") and self.w_rot is not None: + tmp_weight, tmp_bias = self._rotate_weight() + self.register_buffer("tmp_weight", tmp_weight, persistent=False) + self.register_buffer("tmp_bias", tmp_bias, persistent=False) self.tmp_weight = self.w_qdq(self) - self.tmp_bias = self.bias - elif self.dynamic_quant_tmp_weight: - self.tmp_weight = self.w_qdq(self) + else: + if not hasattr(self, "tmp_weight"): + tmp_weight = self.w_qdq(self) + self.register_buffer("tmp_weight", tmp_weight, persistent=False) + self.tmp_bias = self.bias - x = torch.functional.F.linear(x, self.tmp_weight, self.tmp_bias) + elif self.dynamic_quant_weight: + self.tmp_weight = self.w_qdq(self) + self.tmp_bias = self.bias + + elif self.dynamic_quant_tmp_weight: + self.tmp_weight = self.w_qdq(self) + x = torch.functional.F.linear(x, self.tmp_weight, self.tmp_bias) return x + def _rotate_weight(self): + tmp_weight, tmp_bias = self.w_rot(self) + return tmp_weight, tmp_bias + @classmethod @torch.no_grad() def new(cls, module, w_qdq, a_qdq): @@ -411,14 +445,15 @@ def new(cls, module, w_qdq, a_qdq): bias = module.bias.data else: bias = None + - new_module = cls(weight, bias, ori_module=module, w_qdq=w_qdq, a_qdq=a_qdq) + new_module = cls(weight, bias, ori_module=module, w_qdq=w_qdq, a_qdq=a_qdq, w_rot=module.w_rot, a_rot=module.a_rot) new_module.in_features = module.in_features new_module.out_features = module.out_features new_module.w_qdq_name = cls.get_func_name(w_qdq) new_module.a_qdq_name = ( - cls.get_func_name(a_qdq) if a_qdq is not None else 'None' + cls.get_func_name(a_qdq) if a_qdq is not None else "None" ) return new_module @@ -433,37 +468,36 @@ def register_activation_parameters(self, named_parameters): def __repr__(self): 
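        # Summarizes shape, bias, weight/act quantizers and rotation flags for logging.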
return ( - f'FakeQuantLinear(in_features={self.in_features},' - f'out_features={self.out_features}, bias={self.bias is not None},' - f'weight_quant={self.w_qdq_name},' - f'act_quant={self.a_qdq_name},' - f'online_rotate={self.buf_rotate})' + f"FakeQuantLinear(in_features={self.in_features}," + f"out_features={self.out_features}, bias={self.bias is not None}," + f"weight_quant={self.w_qdq_name}, " + f"act_quant={self.a_qdq_name}, " + f"w_rotate={self.buf_w_rotate}, " + f"a_rotate={self.buf_a_rotate}," ) class EffcientFakeQuantLinear(nn.Module): def __init__(self, weight, bias, ori_module, a_qdq): super().__init__() - self.register_buffer('weight', weight) + self.register_buffer("weight", weight) if bias is not None: - self.register_buffer('bias', bias) + self.register_buffer("bias", bias) else: self.bias = None self.a_qdq = a_qdq for name, buf in ori_module.named_buffers(): - if name.startswith('buf_'): + if name.startswith("buf_"): self.register_buffer(name, buf.data) - if hasattr(self, 'buf_rotate') and self.buf_rotate: - self.rotater = ori_module.rotater - else: - self.buf_rotate = False + if getattr(self, "buf_a_rotate", False): + self.a_rot = ori_module.a_rot @torch.no_grad() def forward(self, x): - if hasattr(self, 'buf_rotate') and self.buf_rotate: - x = self.rotater.rotate(x) + if hasattr(self, "a_rot"): + x = self.a_rot(x, self) if self.a_qdq is not None: x = self.a_qdq(x, self) @@ -473,6 +507,10 @@ def forward(self, x): @classmethod @torch.no_grad() def new(cls, module, w_qdq, a_qdq, debug_print={}): + + if hasattr(module, "w_rot") and module.w_rot is not None: + weight, bias = module.w_rot(module) + weight = w_qdq(module) if module.bias is not None: @@ -486,7 +524,7 @@ def new(cls, module, w_qdq, a_qdq, debug_print={}): new_module.out_features = module.out_features new_module.w_qdq_name = cls.get_func_name(w_qdq) new_module.a_qdq_name = ( - cls.get_func_name(a_qdq) if a_qdq is not None else 'None' + cls.get_func_name(a_qdq) if a_qdq is not None else "None" ) new_module.debug_print = debug_print return new_module @@ -499,146 +537,29 @@ def get_func_name(cls, any_callable): def __repr__(self): return ( - f'EffcientFakeQuantLinear(in_features={self.in_features},' - f'out_features={self.out_features},' - f'bias={self.bias is not None},' - f'weight_quant={self.w_qdq_name},' - f'act_quant={self.a_qdq_name},' - f'online_rotate={self.buf_rotate},' - f'debug_print={self.debug_print})' - ) - - -class VllmRealQuantLinear(nn.Module): - def __init__(self, weight, bias, scales, need_pack): - super().__init__() - weight_name = 'weight_packed' if need_pack else 'weight' - self.register_buffer(weight_name, weight) - - ( - self.register_buffer('bias', bias) - if bias is not None - else setattr(self, 'bias', None) + f"EffcientFakeQuantLinear(in_features={self.in_features}, " + f"out_features={self.out_features}, " + f"bias={self.bias is not None}, " + f"weight_quant={self.w_qdq_name}, " + f"act_quant={self.a_qdq_name}, " + f"debug_print={self.debug_print})" ) - self.register_buffer('weight_scale', scales) - - @torch.no_grad() - def forward(self, x): - raise NotImplementedError - - @classmethod - @torch.no_grad() - def new(cls, module, w_q, quant_config): - weight, scales = cls.quant_pack(module, w_q, quant_config) - if module.bias is not None: - bias = module.bias.data - else: - bias = None - need_pack = quant_config['weight'].get('need_pack', False) - new_module = cls(weight, bias, scales, need_pack) - new_module.in_features = module.in_features - new_module.out_features = module.out_features - 
new_module.weight_shape = weight.shape - new_module.weight_dtype = weight.dtype - new_module.scales_shape = scales.shape - new_module.scales_dtype = scales.dtype - - new_module.zeros_shape = None - new_module.zeros_dtype = None - - return new_module - - @classmethod - @torch.no_grad() - def quant_pack(cls, module, w_q, quant_config): - weight, scales, zeros = w_q(module) - need_pack = quant_config['weight'].get('need_pack', False) - if need_pack: - weight, scales = cls.pack(weight, scales, quant_config) - return weight, scales - - @classmethod - @torch.no_grad() - def pack(self, weight, scales, quant_config): - - # Packs a tensor of quantized weights stored in int8 into int32s with padding - scales = scales.to(torch.float16) - num_bits = quant_config['weight']['bit'] - - # convert to unsigned for packing - offset = pow(2, num_bits) // 2 - weight = (weight + offset).to(torch.uint8) - weight = weight.cpu().numpy().astype(np.uint32) - pack_factor = 32 // num_bits - - # pad input tensor and initialize packed output - packed_size = math.ceil(weight.shape[1] / pack_factor) - packed = np.zeros((weight.shape[0], packed_size), dtype=np.uint32) - padding = packed.shape[1] * pack_factor - weight.shape[1] - weight = np.pad(weight, pad_width=[(0, 0), (0, padding)], constant_values=0) - - # pack values - for i in range(pack_factor): - packed |= weight[:, i::pack_factor] << num_bits * i - - packed = np.ascontiguousarray(packed).view(np.int32) - int_weight = torch.from_numpy(packed) - return int_weight, scales - - def __repr__(self): - return ( - 'VllmRealQuantLinear(' - + f'in_features={self.in_features}, ' - + f'out_features={self.out_features}, ' - + f'bias={self.bias is not None}, ' - + f'weight_shape={self.weight_shape}, ' - + f'weight_dtype={self.weight_dtype}, ' - + f'scales_shape={self.scales_shape}, ' - + f'scales_dtype={self.scales_dtype}, ' - + f'zeros_shape={self.zeros_shape}, ' - + f'zeros_dtype={self.zeros_dtype})' - ) - - -class SglRealQuantLinear(VllmRealQuantLinear): - def __init__(self, weight, bias, scales, need_pack): - super().__init__(weight, bias, scales, need_pack) - - def __repr__(self): - return ( - 'SglRealQuantLinear(' - + f'in_features={self.in_features}, ' - + f'out_features={self.out_features}, ' - + f'bias={self.bias is not None}, ' - + f'weight_shape={self.weight_shape}, ' - + f'weight_dtype={self.weight_dtype}, ' - + f'scales_shape={self.scales_shape}, ' - + f'scales_dtype={self.scales_dtype}, ' - + f'zeros_shape={self.zeros_shape}, ' - + f'zeros_dtype={self.zeros_dtype})' - ) - - -class AutoawqRealQuantLinear(nn.Module): +class RealQuantLinear(nn.Module): def __init__(self, weight, bias, scales, zeros): super().__init__() - self.register_buffer('qweight', weight) - - ( - self.register_buffer('bias', bias) - if bias is not None - else setattr(self, 'bias', None) - ) - - self.register_buffer('scales', scales) + self.register_buffer("weight", weight) + if bias is not None: + self.register_buffer("bias", bias) + else: + self.bias = None + self.register_buffer("scales", scales) - ( - self.register_buffer('qzeros', zeros) - if zeros is not None - else setattr(self, 'qzeros', None) - ) + if zeros is not None: + self.register_buffer("zeros", zeros) + else: + self.zero = None @torch.no_grad() def forward(self, x): @@ -674,166 +595,98 @@ def new(cls, module, w_q, quant_config): @torch.no_grad() def quant_pack(cls, module, w_q, quant_config): weight, scales, zeros = w_q(module) - pack_version = quant_config['weight']['pack_version'] - if pack_version == 'gemm_pack': - int_weight, 
scales, int_zeros = \ - cls.gemm_pack(weight, scales, zeros, quant_config) - elif pack_version == 'gemv_pack': - int_weight, scales, int_zeros = \ - cls.gemv_pack(module, weight, scales, zeros, quant_config) - return int_weight, scales, int_zeros + weight, scales, zeros = cls.pack(weight, scales, zeros, quant_config) + return weight, scales, zeros @classmethod @torch.no_grad() - def gemm_pack(self, weight, scales, zeros, quant_config): - - if zeros is not None: - zeros = zeros.t().contiguous() - scales = scales.t().contiguous() - weight = weight.t().contiguous() - - bit = quant_config['weight']['bit'] - pack_num = 32 // bit - - int_weight = torch.zeros( - (weight.shape[0], weight.shape[1] // 32 * bit), - dtype=torch.int32, - device=weight.device, - ) - - for col in range(weight.shape[1] // pack_num): - if bit == 4: - order_map = [0, 2, 4, 6, 1, 3, 5, 7] - else: - raise NotImplementedError('Only 4-bit are supported for now.') - for i in range(pack_num): - int_weight_col = weight[:, col * pack_num + order_map[i]] - int_weight[:, col] |= int_weight_col << (i * bit) - - if zeros is not None: - int_zeros = torch.zeros( - (zeros.shape[0], zeros.shape[1] // 32 * bit), - dtype=torch.int32, - device=zeros.device, - ) - - for col in range(zeros.shape[1] // pack_num): - if bit == 4: - order_map = [0, 2, 4, 6, 1, 3, 5, 7] - else: - raise NotImplementedError('Only 4-bit are supported for now.') - for i in range(pack_num): - intzero_col = zeros[:, col * pack_num + order_map[i]] - int_zeros[:, col] |= intzero_col << (i * bit) + def pack(self, weight, scales, zeros, quant_config): + if quant_config["weight"]["bit"] == 8: + if zeros is not None: + zeros = zeros.view(weight.shape[0], -1) + scales = scales.view(weight.shape[0], -1) + return weight, scales, zeros + + h1, h2 = weight.shape + # pack 8 int4 in an int32 number, pack 16 int2 in an int32 number. + bit = quant_config["weight"]["bit"] + tmp = 32 // bit + + if ( + quant_config["weight"]["group_size"] != -1 + and quant_config["weight"]["granularity"] == "per_group" + ): + group_size = quant_config["weight"]["group_size"] else: - int_zeros = None + group_size = h2 - return int_weight, scales, int_zeros + assert h1 % tmp == 0 and h2 % tmp == 0, "H1 {} H2 {}".format(h1, h2) + assert h2 % group_size == 0, "H1 {} H2 {}".format(h1, h2) - @classmethod - @torch.no_grad() - def gemv_pack(self, module, weight, scales, zeros, quant_config): - - bit = quant_config['weight']['bit'] - group_size = quant_config['weight']['group_size'] - pack_num = 32 // bit - - q_scales = torch.zeros( - ( - scales.shape[0], - calculate_zeros_width(module.in_features, group_size) * pack_num, - ), - dtype=torch.float16, - device=scales.device, - ) - q_scales[:, : scales.shape[1]] = scales - - int_weight = torch.zeros( - (weight.shape[0], weight.shape[1] // 32 * bit), - dtype=torch.int32, - device=weight.device, - ) - - for col in range(weight.shape[1] // pack_num): - if bit == 4: - order_map = [0, 1, 2, 3, 4, 5, 6, 7] - else: - raise NotImplementedError('Only 4-bit are supported for now.') - for i in range(pack_num): - int_weight_col = weight[:, col * pack_num + order_map[i]] - int_weight[:, col] |= int_weight_col << (i * bit) + weight = weight.cuda() + int_weight = torch.empty(h1, h2 // tmp).to(torch.int32).cuda() + # Weight pack in row. 
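        # Each int32 of int_weight holds tmp = 32 // bit consecutive column values:
        # the quantized value from column pack + i occupies bits
        # [i * bit, (i + 1) * bit) of output column pack // tmp.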
+ for pack in range(0, h2, tmp): + for i in range(tmp): + int_weight[:, pack // tmp] += weight[:, pack + i] << (i * bit) + weight = weight.cpu() + int_weight = int_weight.cpu() + del weight if zeros is not None: - int_zeros = torch.zeros( - (zeros.shape[0], calculate_zeros_width(module.in_features, group_size)), - dtype=torch.int32, - device=zeros.device, - ) - - for col in range(zeros.shape[1] // pack_num): - if bit == 4: - order_map = [0, 1, 2, 3, 4, 5, 6, 7] - else: - raise NotImplementedError('Only 4-bit are supported for now.') - for i in range(pack_num): - if col * pack_num + order_map[i] >= zeros.shape[1]: - continue - int_zero_col = zeros[:, col * pack_num + order_map[i]] - int_zeros[:, col] |= int_zero_col << (i * bit) + zeros = zeros.cuda() + int_zeros = torch.zeros(h1 // tmp, h2 // group_size).to(torch.int32).cuda() + zeros = zeros.view(h1, -1) + # zero point pack in col. + for pack in range(0, h1, tmp): + for i in range(tmp): + int_zeros[pack // tmp, :] += zeros[pack + i, :] << (i * bit) + zeros = zeros.cpu() + int_zeros = int_zeros.cpu() + del zeros else: int_zeros = None - return int_weight, q_scales, int_zeros + gc.collect() + torch.cuda.empty_cache() - def __repr__(self): - return ( - 'AutoawqRealQuantLinear(' - + f'in_features={self.in_features}, ' - + f'out_features={self.out_features}, ' - + f'bias={self.bias is not None}, ' - + f'weight_shape={self.weight_shape}, ' - + f'weight_dtype={self.weight_dtype}, ' - + f'scales_shape={self.scales_shape}, ' - + f'scales_dtype={self.scales_dtype}, ' - + f'zeros_shape={self.zeros_shape}, ' - + f'zeros_dtype={self.zeros_dtype})' - ) - - -class MlcllmRealQuantLinear(AutoawqRealQuantLinear): - def __init__(self, weight, bias, scales, zeros): - super().__init__(weight, bias, scales, zeros) + scales = scales.view(h1, -1) + return int_weight, scales, int_zeros def __repr__(self): return ( - 'MlcllmRealQuantLinear(' - + f'in_features={self.in_features}, ' - + f'out_features={self.out_features}, ' - + f'bias={self.bias is not None}, ' - + f'weight_shape={self.weight_shape}, ' - + f'weight_dtype={self.weight_dtype}, ' - + f'scales_shape={self.scales_shape}, ' - + f'scales_dtype={self.scales_dtype}, ' - + f'zeros_shape={self.zeros_shape}, ' - + f'zeros_dtype={self.zeros_dtype})' + "RealQuantLinear(" + + f"in_features={self.in_features}, " + + f"out_features={self.out_features}, " + + f"bias={self.bias is not None}, " + + f"weight_shape={self.weight_shape}, " + + f"weight_dtype={self.weight_dtype}, " + + f"scales_shape={self.scales_shape}, " + + f"scales_dtype={self.scales_dtype}, " + + f"zeros_shape={self.zeros_shape}, " + + f"zeros_dtype={self.zeros_dtype})" ) -_TRANSFORMERS_LN_TYPES_ = ALL_LAYERNORM_LAYERS +_TRANSFORMERS_LN_TYPES_ = ALL_LAYERNORM_LAYERS + [ + MistralRMSNorm, + MixtralRMSNorm, + Qwen2RMSNorm, + LlamaRMSNorm, + nn.LayerNorm, +] _TRANSFORMERS_LINEAR_TYPES_ = [nn.Linear] _MODEL_LN_TYPES_PAIRS_ = { - 'Llama': LlmcLlamaRMSNorm, - 'Llava': LlmcLlamaRMSNorm, - 'Mistral': LlmcMistralRMSNorm, - 'Mixtral': LlmcMixtralRMSNorm, - 'Interlm2': LlmcInternLM2RMSNorm, - 'Qwen2': LlmcQwen2RMSNorm, - 'Gemma2': LlmcGemma2RMSNorm, - 'MiniCPM': LlmcMiniCPMRMSNorm, - 'Starcoder': LlmcLayerNorm, - 'Opt': LlmcLayerNorm, - 'Bloom': LlmcLayerNorm, + "Llama": LlmcLlamaRMSNorm, + "Llava": LlmcLlamaRMSNorm, + "Mistral": LlmcMistralRMSNorm, + "Mixtral": LlmcMixtralRMSNorm, + "Interlm2": LlmcInternLM2RMSNorm, + "Qwen2": LlmcQwen2RMSNorm, + "Starcoder": LlmcLayerNorm, + "Opt": LlmcLayerNorm, + "Bloom": LlmcLayerNorm, } @@ -845,8 +698,6 @@ def 
__repr__(self): LlmcMistralRMSNorm, LlmcMixtralRMSNorm, LlmcInternLM2RMSNorm, - LlmcGemma2RMSNorm, - LlmcMiniCPMRMSNorm, ] @@ -855,16 +706,5 @@ def __repr__(self): RotateLinear, FakeQuantLinear, EffcientFakeQuantLinear, - VllmRealQuantLinear, - SglRealQuantLinear, - AutoawqRealQuantLinear, - MlcllmRealQuantLinear + RealQuantLinear, ] - - -_REALQUANT_LINEAR_MAP_ = { - 'vllm_quant': VllmRealQuantLinear, - 'sgl_quant': SglRealQuantLinear, - 'autoawq_quant': AutoawqRealQuantLinear, - 'mlcllm_quant': MlcllmRealQuantLinear -} diff --git a/llmc/compression/quantization/ntweak.py b/llmc/compression/quantization/ntweak.py index b758bf2a8..022768a3d 100644 --- a/llmc/compression/quantization/ntweak.py +++ b/llmc/compression/quantization/ntweak.py @@ -1,6 +1,7 @@ import functools import gc import math +import pdb from contextlib import nullcontext from math import inf @@ -19,8 +20,8 @@ @ALGO_REGISTRY class NormTweaking(BaseBlockwiseQuantization): - def __init__(self, model, quant_config, input, padding_mask, config): - super().__init__(model, quant_config, input, padding_mask, config) + def __init__(self, model, quant_config, input, config): + super().__init__(model, quant_config, input, config) self.add_quant_config() model_type = self.config['model']['type'] @@ -139,6 +140,7 @@ def ntweak_train(self, block): if not math.isfinite(loss.item()): logger.info('Loss is NAN, stopping training') + pdb.set_trace() loss_list.append(loss.data) optimizer.zero_grad() diff --git a/llmc/compression/quantization/omniq.py b/llmc/compression/quantization/omniq.py index 8c5ff7c33..bddab35b3 100644 --- a/llmc/compression/quantization/omniq.py +++ b/llmc/compression/quantization/omniq.py @@ -2,6 +2,7 @@ import functools import gc import math +import pdb import random from contextlib import nullcontext from math import inf @@ -24,8 +25,8 @@ @ALGO_REGISTRY class OmniQuant(BaseBlockwiseQuantization): - def __init__(self, model, quant_config, input, padding_mask, config): - super().__init__(model, quant_config, input, padding_mask, config) + def __init__(self, model, quant_config, input, config): + super().__init__(model, quant_config, input, config) self.add_quant_config() model_type = self.config['model']['type'] @@ -213,6 +214,7 @@ def omni_train(self, block): if not math.isfinite(loss.item()): logger.info('Loss is NAN, stopping training') + pdb.set_trace() loss_list.append(loss.data) optimizer.zero_grad() @@ -305,7 +307,7 @@ def register_lwc_parameters(self, block, input_feat, init_value=4.0): torch.ones( (dim, 1), device=self.dev, - dtype=self.dtype, + # dtype=self.dtype, ) * init_value ) @@ -313,7 +315,7 @@ def register_lwc_parameters(self, block, input_feat, init_value=4.0): torch.ones( (dim, 1), device=self.dev, - dtype=self.dtype, + # dtype=self.dtype, ) * init_value ) @@ -383,13 +385,12 @@ def get_clip_parameters(self, input_feat, n, m): inputs = input_feat[n] max_val, min_val = self.auto_clip_layer( - n, m.weight.data, inputs, - n_sample_token=self.config.calib.get('seq_len', None), + n_sample_token=self.config.calib.seq_len, ) - up_factor, low_factor = self.get_clip_factor(m, min_val, max_val, n) + up_factor, low_factor = self.get_clip_factor(m, min_val, max_val) up_param = nn.Parameter(up_factor) low_param = nn.Parameter(low_factor) diff --git a/llmc/compression/quantization/osplus.py b/llmc/compression/quantization/osplus.py index a433c0c23..d95caf9d2 100644 --- a/llmc/compression/quantization/osplus.py +++ b/llmc/compression/quantization/osplus.py @@ -17,9 +17,9 @@ @ALGO_REGISTRY class 
OsPlus(BaseBlockwiseQuantization): - def __init__(self, model, quant_config, input, padding_mask, config): + def __init__(self, model, quant_config, input, config): torch.set_grad_enabled(False) - super().__init__(model, quant_config, input, padding_mask, config) + super().__init__(model, quant_config, input, config) special_config = self.quant_config.get('special', {}) self.weight_clip = special_config.get('weight_clip', False) @@ -106,7 +106,7 @@ def register_hooks(feat_dict): self.auto_clip( block, clip_input_feat, - n_sample_token=self.config.calib.get('seq_len', None), + n_sample_token=self.config.calib.seq_len, eps=3e-1, ) diff --git a/llmc/compression/quantization/quant.py b/llmc/compression/quantization/quant.py index f49f204f0..77c27ece1 100644 --- a/llmc/compression/quantization/quant.py +++ b/llmc/compression/quantization/quant.py @@ -3,30 +3,62 @@ from torch import nn -class BaseQuantizer(object): +class Quantizer: def __init__(self, bit, symmetric, granularity, **kwargs): - self.bit = bit + if isinstance(bit, str): + # for fp quantization, format: ExMy + self.use_fp = True + self.e_bits = int(bit[1]) + self.m_bits = int(bit[-1]) + self.sign_bits = 1 + self.bit = self.e_bits + self.m_bits + self.sign_bits + self.default_bias = 2 ** (self.e_bits - 1) + else: + self.use_fp = False + self.bit = bit self.sym = symmetric self.granularity = granularity self.kwargs = kwargs - self.calib_algo = self.kwargs.get('calib_algo', 'minmax') + if 'calib_algo' in self.kwargs: + self.calib_algo = self.kwargs['calib_algo'] + else: + self.calib_algo = 'minmax' + + if 'qmax_to_tensor' in self.kwargs and self.kwargs['qmax_to_tensor']: + if self.sym: + self.max_int = torch.tensor(2 ** (self.bit - 1) - 1).cuda() + self.min_int = torch.tensor(-(2 ** (self.bit - 1))).cuda() + else: + self.max_int = torch.tensor(2**self.bit - 1).cuda() + self.min_int = torch.tensor(0.0).cuda() + else: + if self.sym: + self.max_int = 2 ** (self.bit - 1) - 1 + self.min_int = -(2 ** (self.bit - 1)) + else: + self.max_int = 2**self.bit - 1 + self.min_int = 0.0 if self.granularity == 'per_group': self.group_size = self.kwargs['group_size'] elif self.granularity == 'per_head': self.head_num = self.kwargs['head_num'] - self.mse_b_num = self.kwargs.get('mse_b_num', 1) - - if self.kwargs.get('ste', False): + if 'ste' in self.kwargs and self.kwargs['ste']: self.round_func = lambda x: (x.round() - x).detach() + x else: self.round_func = torch.round - self.round_zp = self.kwargs.get('round_zp', True) + self.round_zp = 'round_zp' not in self.kwargs or self.kwargs['round_zp'] self.sigmoid = torch.nn.Sigmoid() + def __repr__(self): + return ( + f'Quantizer(bit={self.bit}, sym={self.sym}, granularity={self.granularity},' + f'kwargs={self.kwargs}, max_int={self.max_int}, min_int={self.min_int})' + ) + def get_tensor_range(self, tensor, args={}): if self.calib_algo == 'minmax': return self.get_minmax_range(tensor) @@ -35,7 +67,7 @@ def get_tensor_range(self, tensor, args={}): elif self.calib_algo == 'learnable': return self.get_learnable_range(tensor, **args) else: - raise ValueError(f'Unsupported calibration algorithm: {self.calib_algo}') + logger.info('Calibration Algorithm Not Found!') def get_minmax_range(self, tensor): if self.granularity == 'per_tensor': @@ -47,21 +79,20 @@ def get_minmax_range(self, tensor): return (min_val, max_val) - def get_mse_range(self, tensor, grid=100, norm=2.4, maxshrink=0.8, bs=256): - - assert self.mse_b_num >= 1 and tensor.shape[0] % self.mse_b_num == 0, \ - 'Batch number must be divisible by 
tensor.shape[0],' - bs = tensor.shape[0] // self.mse_b_num + @torch.no_grad() + def get_mse_range(self, tensor, grid=100, norm=2.4, maxshrink=0.8, bs=1024): + if tensor.shape[0] % bs != 0: + bs = tensor.shape[0] tensor = tensor.float() min_val, max_val = self.get_minmax_range(tensor) dev = tensor.device - for b_num in range(self.mse_b_num): - _tensor = tensor[b_num * bs: (b_num + 1) * bs, :] + for b_num in range(tensor.shape[0] // bs): + _tensor = tensor[b_num * bs : (b_num + 1) * bs, :] _min_val, _max_val = ( - min_val[b_num * bs: (b_num + 1) * bs, :], - max_val[b_num * bs: (b_num + 1) * bs, :], + min_val[b_num * bs : (b_num + 1) * bs, :], + max_val[b_num * bs : (b_num + 1) * bs, :], ) best = torch.full([_tensor.shape[0]], float('inf'), device=dev) @@ -74,22 +105,18 @@ def get_mse_range(self, tensor, grid=100, norm=2.4, maxshrink=0.8, bs=256): xmin = p * _min_val xmax = p * _max_val - if self.quant_type == 'float-quant' and not self.use_qtorch: - clip_tensor, scales = self.get_float_qparams( - _tensor, (xmin, xmax), dev + if not self.use_fp: + scales, zeros, max_int, min_int = self.get_qparams( + (xmin, xmax), dev ) - zeros, qmin, qmax = 0, None, None q_tensor = self.quant_dequant( - clip_tensor, scales, zeros, qmax, qmin + _tensor, scales, zeros, max_int, min_int ) - else: - scales, zeros, qmax, qmin = self.get_qparams( - (xmin, xmax), dev - ) - q_tensor = self.quant_dequant( - _tensor, scales, zeros, qmax, qmin + clip_tensor, scales = self.get_fp_qparams( + _tensor, (xmin, xmax), dev ) + q_tensor = self.fp_quant_dequant(clip_tensor, scales) q_tensor -= _tensor q_tensor.abs_() @@ -104,8 +131,8 @@ def get_mse_range(self, tensor, grid=100, norm=2.4, maxshrink=0.8, bs=256): best_max_val[tmp] = xmax[tmp] ( - min_val[b_num * bs: (b_num + 1) * bs, :], - max_val[b_num * bs: (b_num + 1) * bs, :], + min_val[b_num * bs : (b_num + 1) * bs, :], + max_val[b_num * bs : (b_num + 1) * bs, :], ) = (best_min_val, best_max_val) return (min_val, max_val) @@ -128,94 +155,68 @@ def get_learnable_range(self, tensor, lowbound_factor=None, upbound_factor=None) def get_qparams(self, tensor_range, device): min_val, max_val = tensor_range[0], tensor_range[1] - qmin = self.qmin - qmax = self.qmax + max_int = self.max_int + min_int = self.min_int if self.sym: abs_max = torch.max(max_val.abs(), min_val.abs()) abs_max = abs_max.clamp(min=1e-5) - scales = abs_max / qmax + scales = abs_max / max_int zeros = torch.tensor(0.0) else: - scales = (max_val - min_val).clamp(min=1e-5) / (qmax - qmin) - zeros = (qmin - torch.round(min_val / scales)).clamp(qmin, qmax) + scales = (max_val - min_val).clamp(min=1e-5) / max_int + zeros = (-torch.round(min_val / scales)).clamp(min_int, max_int) if not self.round_zp: - zeros = qmin - (min_val / scales) - return scales, zeros, qmax, qmin + zeros = -min_val / scales + return scales, zeros, max_int, min_int def get_tensor_qparams(self, tensor, args={}): tensor = self.reshape_tensor(tensor) tensor_range = self.get_tensor_range(tensor, args) - scales, zeros, qmax, qmin = self.get_qparams(tensor_range, tensor.device) - return tensor, scales, zeros, qmax, qmin + scales, zeros, max_int, min_int = self.get_qparams(tensor_range, tensor.device) + return tensor, scales, zeros, max_int, min_int - def reshape_tensor(self, tensor, allow_padding=False): - if self.granularity == 'per_group': - if tensor.shape[1] >= self.group_size: - if tensor.shape[1] % self.group_size == 0: - t = tensor.reshape(-1, self.group_size) - elif allow_padding: - deficiency = self.group_size - tensor.shape[1] % 
self.group_size - prefix = tensor.shape[:-1] - pad_zeros = torch.zeros( - (*prefix, deficiency), - device=tensor.device, dtype=tensor.dtype) - t = torch.cat( - (tensor, pad_zeros), - dim=-1).reshape(-1, self.group_size) - else: - raise ValueError( - f'Dimension {tensor.shape[-1]} ' - f'not divisible by group size {self.group_size}' - ) - else: - t = tensor - elif self.granularity == 'per_head': - t = tensor.reshape(self.head_num, -1) - else: - t = tensor - return t + def get_fp_tensor_qparams(self, tensor, args={}): + tensor = self.reshape_tensor(tensor) + tensor_range = self.get_tensor_range(tensor, args) + clip_tensor, scales = self.get_fp_qparams(tensor, tensor_range, tensor.device) + return clip_tensor, scales - def restore_tensor(self, tensor, shape): - if tensor.shape == shape: - t = tensor - else: - try: - t = tensor.reshape(shape) - except RuntimeError: - deficiency = self.group_size - shape[1] % self.group_size - t = tensor.reshape(*shape[:-1], -1)[..., :-deficiency] - return t + def get_fp_qparams(self, tensor, tensor_range, device): + min_val, max_val = tensor_range[0], tensor_range[1] + maxval = torch.max(max_val, -min_val) + e_bits = torch.tensor(self.e_bits, dtype=torch.float32).cuda() + m_bits = torch.tensor(self.m_bits, dtype=torch.float32).cuda() -class IntegerQuantizer(BaseQuantizer): - def __init__(self, bit, symmetric, granularity, **kwargs): - super().__init__(bit, symmetric, granularity, **kwargs) - self.quant_type = 'int-quant' - if 'int_range' in self.kwargs: - self.qmin = self.kwargs['int_range'][0] - self.qmax = self.kwargs['int_range'][1] - else: - if self.sym: - self.qmin = -(2 ** (self.bit - 1)) - self.qmax = 2 ** (self.bit - 1) - 1 - else: - self.qmin = 0.0 - self.qmax = 2**self.bit - 1 + if maxval.shape[0] != 1 and len(maxval.shape) != len(tensor.shape): + maxval = maxval.view([-1] + [1] * (len(tensor.shape) - 1)) - if self.kwargs.get('qmax_to_tensor'): - self.qmin = torch.tensor(self.qmin).cuda() - self.qmax = torch.tensor(self.qmax).cuda() + if e_bits >= 5: + maxval = maxval.to(dtype=torch.float32) + + bias = 2**e_bits - torch.log2(maxval) + torch.log2(2 - 2 ** (-m_bits)) - 1 + + xc = torch.min(torch.max(tensor, -maxval), maxval) - def quant(self, tensor, scales, zeros, qmax, qmin): + log_scales = torch.clamp( + (torch.floor(torch.log2(torch.abs(xc)) + bias)).detach(), 1.0 + ) + + scales = 2.0 ** (log_scales - m_bits - bias) + + return xc, scales + + def quant(self, tensor, scales, zeros, max_int, min_int): if self.round_zp: tensor = torch.clamp( - self.round_func(tensor / scales) + zeros, qmin, qmax + self.round_func(tensor / scales) + zeros, min_int, max_int ) else: + tensor = torch.clamp( self.round_func(tensor / scales.clamp_min(1e-9) + zeros), - qmin, - qmax, + min_int, + max_int, ) return tensor @@ -223,11 +224,34 @@ def dequant(self, tensor, scales, zeros): tensor = (tensor - zeros) * scales return tensor - def quant_dequant(self, tensor, scales, zeros, qmax, qmin): - tensor = self.quant(tensor, scales, zeros, qmax, qmin) + def fp_quant_dequant(self, tensor, scales): + tensor = self.round_func(tensor / scales) * scales + return tensor + + def quant_dequant(self, tensor, scales, zeros, max_int, min_int): + tensor = self.quant(tensor, scales, zeros, max_int, min_int) tensor = self.dequant(tensor, scales, zeros) return tensor + def reshape_tensor(self, tensor): + if self.granularity == 'per_group': + if tensor.shape[1] >= self.group_size: + t = tensor.reshape(-1, self.group_size) + else: + t = tensor + elif self.granularity == 'per_head': + t = 
tensor.reshape(self.head_num, -1) + else: + t = tensor + return t + + def restore_tensor(self, tensor, shape): + if tensor.shape == shape: + t = tensor + else: + t = tensor.reshape(shape) + return t + def fake_quant_act_static(self, act, args={}): if 'int_indices' in args: q_act = act[:, :, args['int_indices']] @@ -242,14 +266,14 @@ def fake_quant_act_static(self, act, args={}): org_act_shape = q_act.shape org_act_dtype = q_act.dtype - scales, zeros, qmax, qmin = ( + scales, zeros, max_int, min_int = ( args['scales'], args['zeros'], - args['qmax'], - args['qmin'], + args['max_int'], + args['min_int'], ) q_act = self.reshape_tensor(q_act) - q_act = self.quant_dequant(q_act, scales, zeros, qmax, qmin) + q_act = self.quant_dequant(q_act, scales, zeros, max_int, min_int) q_act = self.restore_tensor(q_act, org_act_shape).to(org_act_dtype) if 'current_bit' in args: @@ -263,6 +287,7 @@ def fake_quant_act_static(self, act, args={}): return q_act + # support mix precision quant act def fake_quant_act_dynamic(self, act, args={}): if 'int_indices' in args: q_act = act[:, :, args['int_indices']] @@ -277,10 +302,14 @@ def fake_quant_act_dynamic(self, act, args={}): org_act_shape = q_act.shape org_act_dtype = q_act.dtype - q_act, scales, zeros, qmax, qmin = self.get_tensor_qparams( - q_act, args - ) - q_act = self.quant_dequant(q_act, scales, zeros, qmax, qmin) + if not self.use_fp: + q_act, scales, zeros, max_int, min_int = self.get_tensor_qparams( + q_act, args + ) + q_act = self.quant_dequant(q_act, scales, zeros, max_int, min_int) + else: + q_act, scales = self.get_fp_tensor_qparams(q_act, args) + q_act = self.fp_quant_dequant(q_act, scales) q_act = self.restore_tensor(q_act, org_act_shape).to(org_act_dtype) @@ -309,14 +338,14 @@ def fake_quant_weight_static(self, weight, args): org_w_shape = q_weight.shape org_w_dtype = q_weight.dtype - scales, zeros, qmax, qmin = ( + scales, zeros, max_int, min_int = ( args['scales'], args['zeros'], - args['qmax'], - args['qmin'], + args['max_int'], + args['min_int'], ) q_weight = self.reshape_tensor(q_weight) - q_weight = self.quant_dequant(q_weight, scales, zeros, qmax, qmin) + q_weight = self.quant_dequant(q_weight, scales, zeros, max_int, min_int) q_weight = self.restore_tensor(q_weight, org_w_shape).to(org_w_dtype) if 'int_indices' in args: @@ -330,6 +359,7 @@ def fake_quant_weight_static(self, weight, args): return q_weight + # support mix precision quant weight def fake_quant_weight_dynamic(self, weight, args={}): if 'int_indices' in args: if self.granularity == 'per_group': @@ -348,11 +378,15 @@ def fake_quant_weight_dynamic(self, weight, args={}): org_w_shape = q_weight.shape org_w_dtype = q_weight.dtype + if not self.use_fp: + q_weight, scales, zeros, max_int, min_int = self.get_tensor_qparams( + q_weight, args + ) - q_weight, scales, zeros, qmax, qmin = self.get_tensor_qparams( - q_weight, args - ) - q_weight = self.quant_dequant(q_weight, scales, zeros, qmax, qmin) + q_weight = self.quant_dequant(q_weight, scales, zeros, max_int, min_int) + else: + q_weight, scales = self.get_fp_tensor_qparams(q_weight, args) + q_weight = self.fp_quant_dequant(q_weight, scales) q_weight = self.restore_tensor(q_weight, org_w_shape).to(org_w_dtype) @@ -372,282 +406,48 @@ def fake_quant_weight_dynamic(self, weight, args={}): def real_quant_weight_static(self, weight, args): org_w_shape = weight.shape - scales, zeros, qmax, qmin = ( + scales, zeros, max_int, min_int = ( args['scales'], args['zeros'], - args['qmax'], - args['qmin'], + args['max_int'], + args['min_int'], ) 
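+        # Produce raw integer codes here (quant without dequant); a downstream
+        # packing step (e.g. RealQuantLinear.pack in this patch) is assumed to
+        # store these codes together with the per-group scales/zeros.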
weight = self.reshape_tensor(weight) - weight = self.quant(weight, scales, zeros, qmax, qmin) + weight = self.quant(weight, scales, zeros, max_int, min_int) weight = self.restore_tensor(weight, org_w_shape) if self.bit == 8: - if self.qmin != 0: + if self.sym: dtype = torch.int8 else: dtype = torch.uint8 else: dtype = torch.int32 weight = weight.to(dtype) - if not self.sym and self.round_zp: + if zeros != torch.tensor(0.0) and self.round_zp: zeros = zeros.to(dtype) - elif self.sym: + else: zeros = None - if zeros is not None: - zeros = zeros.view(weight.shape[0], -1) - scales = scales.view(weight.shape[0], -1) - return weight, scales, zeros def real_quant_weight_dynamic(self, weight, args={}): org_w_shape = weight.shape - weight, scales, zeros, qmax, qmin = self.get_tensor_qparams(weight, args) - weight = self.quant(weight, scales, zeros, qmax, qmin) + weight, scales, zeros, max_int, min_int = self.get_tensor_qparams(weight, args) + weight = self.quant(weight, scales, zeros, max_int, min_int) weight = self.restore_tensor(weight, org_w_shape) if self.bit == 8: - if self.qmin != 0: + if self.sym: dtype = torch.int8 else: dtype = torch.uint8 else: dtype = torch.int32 weight = weight.to(dtype) - if not self.sym and self.round_zp: + if zeros != torch.tensor(0.0) and self.round_zp: zeros = zeros.to(dtype) - elif self.sym: - zeros = None - - if zeros is not None: - zeros = zeros.view(weight.shape[0], -1) - scales = scales.view(weight.shape[0], -1) - - return weight, scales, zeros - - def __repr__(self): - return ( - f'IntegerQuantizer(bit={self.bit}, sym={self.sym},' - f'granularity={self.granularity},' - f'kwargs={self.kwargs}, qmin={self.qmin}, qmax={self.qmax})' - ) - - -class FloatQuantizer(BaseQuantizer): - def __init__(self, bit, symmetric, granularity, **kwargs): - super().__init__(bit, symmetric, granularity, **kwargs) - self.sym = True - self.quant_type = 'float-quant' - self.e_bits = int(self.bit[1]) - self.m_bits = int(self.bit[-1]) - self.sign_bits = 1 - self.num_bits = self.e_bits + self.m_bits + self.sign_bits - self.default_bias = 2 ** (self.e_bits - 1) - - self.use_qtorch = self.kwargs.get('use_qtorch') - if self.use_qtorch: - try: - from qtorch.quant import float_quantize - except ImportError: - logger.error('qtorch not found, please install qtorch.') - raise ImportError('Please install qtorch (pip install qtorch).') - - self.float_quantize = float_quantize - - if 'float_range' in self.kwargs: - self.qmin, self.qmax = self.kwargs['float_range'] - else: - bit_ranges = { - ('e4m3', 8): torch.float8_e4m3fn, - ('e5m2', 8): torch.float8_e5m2, - ('e3m2', 6): (-28, 28), - ('e4m7', 12): (-510, 510), - ('e2m1', 4): (-6, 6), - } - - key = (self.bit, self.num_bits) - if key in bit_ranges: - if isinstance(bit_ranges[key], tuple): - self.qmin, self.qmax = bit_ranges[key] - else: - finfo = torch.finfo(bit_ranges[key]) - self.qmin, self.qmax = finfo.min, finfo.max - else: - raise NotImplementedError('Only 4, 6, 8, and \ - 12-bit quantization is supported.') - - def get_float_qparams(self, tensor, tensor_range, device): - min_val, max_val = tensor_range[0], tensor_range[1] - maxval = torch.max(max_val, -min_val) - - e_bits = torch.tensor(self.e_bits, dtype=torch.float32).cuda() - m_bits = torch.tensor(self.m_bits, dtype=torch.float32).cuda() - - if maxval.shape[0] != 1 and len(maxval.shape) != len(tensor.shape): - maxval = maxval.view([-1] + [1] * (len(tensor.shape) - 1)) - - if e_bits >= 5: - maxval = maxval.to(dtype=torch.float32) - - bias = 2**e_bits - torch.log2(maxval) + torch.log2(2 - 2 ** 
(-m_bits)) - 1 - - xc = torch.min(torch.max(tensor, -maxval), maxval) - - log_scales = torch.clamp( - (torch.floor(torch.log2(torch.abs(xc)) + bias)).detach(), 1.0 - ) - scales = 2.0 ** (log_scales - m_bits - bias) - - return xc, scales - - def get_tensor_qparams(self, tensor, args={}): - tensor = self.reshape_tensor(tensor) - tensor_range = self.get_tensor_range(tensor, args) - if self.use_qtorch: - scales, zeros, qmax, qmin = self.get_qparams(tensor_range, tensor.device) else: - tensor, scales = self.get_float_qparams(tensor, tensor_range, tensor.device) - zeros, qmin, qmax = torch.tensor(0), None, None - - return tensor, scales, zeros, qmax, qmin - - def quant(self, tensor, scales, zeros, qmax, qmin): - scales[scales == 0] = 1 - scaled_tensor = tensor / scales + zeros - if self.use_qtorch: - org_dtype = scaled_tensor.dtype - q_tensor = self.float_quantize(scaled_tensor.float(), - self.e_bits, - self.m_bits, - rounding='nearest') - q_tensor.to(org_dtype) - else: - q_tensor = self.round_func(scaled_tensor) - return q_tensor - - def dequant(self, tensor, scales, zeros): - tensor = (tensor - zeros) * scales - return tensor - - def quant_dequant(self, tensor, scales, zeros, qmax, qmin): - tensor = self.quant(tensor, scales, zeros, qmax, qmin) - tensor = self.dequant(tensor, scales, zeros) - return tensor - - def fake_quant_act_static(self, act, args={}): - q_act = act - org_act_shape = q_act.shape - org_act_dtype = q_act.dtype - - scales, zeros, qmax, qmin = ( - args['scales'], - args['zeros'], - args['qmax'], - args['qmin'], - ) - q_act = self.reshape_tensor(q_act) - q_act = self.quant_dequant(q_act, scales, zeros, qmax, qmin) - q_act = self.restore_tensor(q_act, org_act_shape).to(org_act_dtype) - - return q_act - - def fake_quant_act_dynamic(self, act, args={}): - q_act = act - org_act_shape = q_act.shape - org_act_dtype = q_act.dtype - - q_act, scales, zeros, qmax, qmin = self.get_tensor_qparams( - q_act, args - ) - q_act = self.quant_dequant(q_act, scales, zeros, qmax, qmin) - - q_act = self.restore_tensor(q_act, org_act_shape).to(org_act_dtype) - return q_act - - def fake_quant_weight_static(self, weight, args): - - if 'dim' in args and 'ic' in args['dim']: - q_weight = weight.T - else: - q_weight = weight - - org_w_shape = q_weight.shape - org_w_dtype = q_weight.dtype - scales, zeros, qmax, qmin = ( - args['scales'], - args['zeros'], - args['qmax'], - args['qmin'], - ) - q_weight = self.reshape_tensor(q_weight) - q_weight = self.quant_dequant(q_weight, scales, zeros, qmax, qmin) - q_weight = self.restore_tensor(q_weight, org_w_shape).to(org_w_dtype) - - if 'dim' in args and 'ic' in args['dim']: - q_weight = q_weight.T - - return q_weight - - def fake_quant_weight_dynamic(self, weight, args={}): - - if 'dim' in args and 'ic' in args['dim']: - q_weight = weight.T - else: - q_weight = weight - - org_w_shape = q_weight.shape - org_w_dtype = q_weight.dtype - - q_weight, scales, zeros, qmax, qmin = self.get_tensor_qparams( - q_weight, args - ) - q_weight = self.quant_dequant(q_weight, scales, zeros, qmax, qmin) - q_weight = self.restore_tensor(q_weight, org_w_shape).to(org_w_dtype) - - if 'dim' in args and 'ic' in args['dim']: - q_weight = q_weight.T - - return q_weight - - def real_quant_weight_static(self, weight, args): - assert self.bit in ['e4m3', 'e5m2'], 'Only FP8 E4M3 and E5M2 support real quant' - dtype = torch.float8_e4m3fn if self.e_bits == 4 else torch.float8_e5m2 - - org_w_shape = weight.shape - scales, zeros, qmax, qmin = ( - args['scales'], - args['zeros'], - args['qmax'], - 
args['qmin'], - ) - weight = self.reshape_tensor(weight) - weight = self.quant(weight, scales, zeros, qmax, qmin) - weight = self.restore_tensor(weight, org_w_shape) - - weight = weight.to(dtype) - zeros = None - scales = scales.view(weight.shape[0], -1) - return weight, scales, zeros - - def real_quant_weight_dynamic(self, weight, args={}): - assert self.bit in ['e4m3', 'e5m2'], 'Only FP8 E4M3 and E5M2 support real quant' - dtype = torch.float8_e4m3fn if self.e_bits == 4 else torch.float8_e5m2 - - org_w_shape = weight.shape - weight, scales, zeros, qmax, qmin = self.get_tensor_qparams(weight, args) - weight = self.quant(weight, scales, zeros, qmax, qmin) - weight = self.restore_tensor(weight, org_w_shape) + zeros = None - weight = weight.to(dtype) - zeros = None - scales = scales.view(weight.shape[0], -1) return weight, scales, zeros - - def __repr__(self): - return ( - f'FloatQuantizer(bit={self.bit},' - f'e_bits={self.e_bits}, m_bits={self.m_bits},' - f'granularity={self.granularity},' - f'kwargs={self.kwargs}, qmin={self.qmin}, qmax={self.qmax})' - ) diff --git a/llmc/compression/quantization/quarot.py b/llmc/compression/quantization/quarot.py index 8e6c1d2df..fb4baaafd 100644 --- a/llmc/compression/quantization/quarot.py +++ b/llmc/compression/quantization/quarot.py @@ -1,6 +1,4 @@ import gc -import json -import os import torch import torch.nn as nn @@ -16,22 +14,15 @@ @ALGO_REGISTRY class Quarot(BaseBlockwiseQuantization): - def __init__(self, model, quant_config, input, padding_mask, config): - super().__init__(model, quant_config, input, padding_mask, config) + def __init__(self, model, quant_config, input, config): + super().__init__(model, quant_config, input, config) self.dev = torch.device('cuda') self.add_quant_config() self.preprocess() def preprocess(self): - if torch.equal( - self.model.get_head_layers()[0].weight, - self.model.get_embed_layers()[0].weight, - ): - logger.info('Tie weight! Copy embed_layer for head_layer!') - del self.model.get_head_layers()[0].weight - w = self.model.get_embed_layers()[0].weight.clone() - self.model.get_head_layers()[0].weight = nn.Parameter(w) - + assert self.config['model']['type'] in ['Opt', 'Llama'] + # if self.config["model"]["type"] in ["Opt"]: self.remove_mean_from_embed() self.Q = self.get_orthogonal_matrix() @@ -49,31 +40,30 @@ def preprocess(self): ) self.rotate_head(self.Q) + gc.collect() torch.cuda.empty_cache() + def a_rot(self, act, module, a_rotater): + return a_rotater.rotate(act) + @torch.no_grad() def add_quant_config(self): self.rotate_mode = self.quant_config['special']['rotate_mode'] def get_orthogonal_matrix(self): if self.rotate_mode == 'random': - try: - return random_orthogonal_matrix(self.hidden_size, self.dev) - except NameError: - raise RuntimeError( - 'Function random_orthogonal_matrix is not defined.' 
- ) + return random_orthogonal_matrix(self.hidden_size, self.dev) elif self.rotate_mode == 'hadamard': return random_hadamard_matrix(self.hidden_size, self.dev) else: raise ValueError(f'Unsupported mode {self.mode}') - def block_transform(self, block): + def block_transform(self, block, ): logger.info(f'Start transform the {self.block_idx+1}-th block') if self.online_rotate: - self.replace_rotate_linears(block) + self.replace_rotate_fcs(block) subsets = self.model.get_subsets_in_block(block) for index, subset in enumerate(subsets): self.subset_transform(block, subset) @@ -97,31 +87,18 @@ def subset_transform(self, block, subset): self.fuse_ln_fcs(prev_op[0], layers) self.rotate_pre_layers(layers, self.Q) else: - if self.config['model']['type'] in ['Opt', 'StableLm']: - self.bake_mean_into_fc(layers[0]) + if self.config['model']['type'] in ['Opt']: + self.bake_mean_into_linear(layers[0]) if 'is_mlp' in subset and subset['is_mlp']: self.rotate_post_layers( layers, self.Q, exact_had=True if self.online_rotate else False ) else: - for n, m in layers_dict.items(): - logger.info(f'layer: {n} {m.weight.shape}') - logger.info(f'{self.Q.shape}') self.rotate_post_layers(layers, self.Q, exact_had=False) if self.online_rotate: + R2 = None apply_exact_had_to_linear( - prev_op[0], had_dim=self.head_dim, output=True + prev_op[0], had_dim=self.head_dim, output=True, R2=R2 ) - apply_exact_had_to_linear(layers[0], had_dim=-1, output=False) - - @torch.no_grad() - def save_model(self, path): - super().save_model(path) - path = os.path.join(path, 'config.json') - with open(path, 'r') as f: - config = json.load(f) - if 'tie_word_embeddings' in config: - config['tie_word_embeddings'] = False - with open(path, 'w') as f: - json.dump(config, f, indent=4) + apply_exact_had_to_linear(layers[0], had_dim=-1, output=False, R2=R2) diff --git a/llmc/compression/quantization/quik.py b/llmc/compression/quantization/quik.py index 3a1e0441b..f57bf9576 100644 --- a/llmc/compression/quantization/quik.py +++ b/llmc/compression/quantization/quik.py @@ -12,8 +12,8 @@ @ALGO_REGISTRY class QUIK(BaseBlockwiseQuantization): - def __init__(self, model, quant_config, input, padding_mask, config): - super().__init__(model, quant_config, input, padding_mask, config) + def __init__(self, model, quant_config, input, config): + super().__init__(model, quant_config, input, config) self.add_quant_config() def add_quant_config(self): diff --git a/llmc/compression/quantization/rotate_utils.py b/llmc/compression/quantization/rotate_utils.py new file mode 100644 index 000000000..1ed935701 --- /dev/null +++ b/llmc/compression/quantization/rotate_utils.py @@ -0,0 +1,102 @@ +import math + +import torch +import torch.nn as nn +from loguru import logger + +from .hadamard_utils import HadamardTransform, matmul_hadU_cuda + + +class RotateModule(nn.Module): + def __init__(self, Q_init): + super(RotateModule, self).__init__() + self.weight = nn.Parameter(Q_init.to(torch.float32).to(torch.device('cuda'))) + + def forward(self, x, transpose=False): + if transpose: + return x @ self.weight + else: + return self.weight @ x + + +class WeightRotater: + def __init__(self, weight_rotate_func, dev): + self.rotate_func = weight_rotate_func + self.dev = dev + + def rotate(self, weight, bias, Q1, Q2, transpose): + + if Q1 is not None: + tmp_weight, tmp_bias = self.rotate_func(weight, bias, Q1.weight, transpose) + + if Q2 is not None: + had_dim = Q2.weight.shape[0] + dtype = tmp_weight.dtype + if transpose: + init_shape = tmp_weight.shape + tmp_weight = 
tmp_weight.reshape(-1, init_shape[-1] // had_dim, had_dim) + tmp_weight, _ = self.rotate_func(tmp_weight, bias, Q2.weight, False) + tmp_weight = tmp_weight.reshape(init_shape) + else: + tmp_weight = tmp_weight.t() + transposed_shape = tmp_weight.shape + tmp_weight = tmp_weight.reshape(-1, transposed_shape[-1] // had_dim, had_dim) + tmp_weight, _ = self.rotate_func(tmp_weight, bias, Q2.weight, False) + tmp_weight = tmp_weight.reshape(transposed_shape).t() + + if Q1 is None and Q2 is None: + tmp_weight = weight + tmp_bias = bias + + tmp_weight = tmp_weight.to(self.dev) + tmp_bias = tmp_bias.to(self.dev) if tmp_bias is not None else None + + return tmp_weight, tmp_bias + + +class ActRotater: + def __init__( + self, online_full_had, online_partial_had, fp32_had, K, had_K=None, had_dim=None + ): + self.online_full_had = online_full_had + self.online_partial_had = online_partial_had + self.fp32_had = fp32_had + self.K = K + self.had_K = had_K + self.had_dim = had_dim + + def rotate(self, x): + x_dtype = x.dtype + + if self.online_full_had: + if self.fp32_had: + x = matmul_hadU_cuda(x.float(), self.had_K, self.K).to(x_dtype) + else: + x = matmul_hadU_cuda(x, self.had_K, self.K) + + elif self.online_partial_had: + if self.fp32_had: + x = x.float() + init_shape = x.shape + if self.K == 1: + x = ( + HadamardTransform.apply( + x.reshape( + -1, init_shape[-1] // self.had_dim, self.had_dim + ).transpose(1, 2) + ) + / math.sqrt(init_shape[-1] // self.had_dim) + ).transpose(1, 2) + else: + self.had_K = self.had_K.to(x.device) + + x = ( + self.had_K.to(x.dtype) + @ x.reshape(-1, init_shape[-1] // self.had_dim, self.had_dim) + ) / math.sqrt(init_shape[-1] // self.had_dim) + + if self.fp32_had: + x = x.to(x_dtype) + x = x.reshape(init_shape) + + return x diff --git a/llmc/compression/quantization/rtn.py b/llmc/compression/quantization/rtn.py index aba208510..609b1c51d 100644 --- a/llmc/compression/quantization/rtn.py +++ b/llmc/compression/quantization/rtn.py @@ -8,8 +8,8 @@ @ALGO_REGISTRY class RTN(BaseBlockwiseQuantization): - def __init__(self, model, quant_config, input, padding_mask, config): - super().__init__(model, quant_config, input, padding_mask, config) + def __init__(self, model, quant_config, input, config): + super().__init__(model, quant_config, input, config) if quant_config.get('act', False) and quant_config['act'].get('static', False): logger.info('Activation quant is static. 
Calibration is required.') self.act_static = True @@ -18,8 +18,7 @@ def __init__(self, model, quant_config, input, padding_mask, config): @torch.no_grad() def block_opt(self, *opt_kwargs): - if self.act_static: - super().block_opt(*opt_kwargs) + pass def a_qdq(self, act, module, aquantizer): if self.act_static: @@ -30,11 +29,11 @@ def a_qdq(self, act, module, aquantizer): args['zeros'] = ( module.buf_act_zeros if hasattr(module, 'buf_act_zeros') else None ) - args['qmax'] = ( - module.buf_act_qmax if hasattr(module, 'buf_act_qmax') else None + args['max_int'] = ( + module.buf_act_max_int if hasattr(module, 'buf_act_max_int') else None ) - args['qmin'] = ( - module.buf_act_qmin if hasattr(module, 'buf_act_qmin') else None + args['min_int'] = ( + module.buf_act_min_int if hasattr(module, 'buf_act_min_int') else None ) return aquantizer.fake_quant_act_static(act, args) else: @@ -60,18 +59,18 @@ def get_act_qparams(self, layers_dict, act_tensors): avg_max_val = max_val / len(act_tensors) else: avg_max_val += max_val / len(act_tensors) - scales, zeros, qmax, qmin = self.aquantizer.get_qparams( + scales, zeros, max_int, min_int = self.aquantizer.get_qparams( (avg_min_val, avg_max_val), act_tensors[0].device ) for name in layers_dict: layers_dict[name].register_buffer('buf_act_scales', scales) layers_dict[name].register_buffer('buf_act_zeros', zeros) - layers_dict[name].register_buffer('buf_act_qmax', qmax) - layers_dict[name].register_buffer('buf_act_qmin', qmin) + layers_dict[name].register_buffer('buf_act_max_int', max_int) + layers_dict[name].register_buffer('buf_act_min_int', min_int) logger.info(f'{name} act_scales : {scales}') logger.info(f'{name} act_zeros : {zeros}') - logger.info(f'{name} act_qmax : {qmax}') - logger.info(f'{name} act_qmin : {qmin}') + logger.info(f'{name} act_max_int : {max_int}') + logger.info(f'{name} act_min_int : {min_int}') @torch.no_grad() def subset_transform( diff --git a/llmc/compression/quantization/smoothquant.py b/llmc/compression/quantization/smoothquant.py index 706edc80d..2ba1905cd 100644 --- a/llmc/compression/quantization/smoothquant.py +++ b/llmc/compression/quantization/smoothquant.py @@ -11,10 +11,8 @@ @ALGO_REGISTRY class SmoothQuant(BaseBlockwiseQuantization): - def __init__(self, model, quant_config, input, padding_mask, config): - super().__init__(model, quant_config, input, padding_mask, config) - special_config = self.quant_config.get('special', {}) - self.alpha = special_config.get('alpha', 0.5) + def __init__(self, model, quant_config, input, config): + super().__init__(model, quant_config, input, config) @torch.no_grad() def filter_subset(self, subset): @@ -55,7 +53,7 @@ def search_scale_subset(self, layers, tensors): w_max = self.get_weight_scale(layers) x_max = self.get_act_scale(tensors) x_max = x_max.to(dtype=w_max.dtype, device=w_max.device) - scale = (x_max.pow(self.alpha) / w_max.pow(self.alpha)).clamp(min=1e-5) + scale = (x_max.pow(0.5) / w_max.pow(0.5)).clamp(min=1e-5) return scale @torch.no_grad() diff --git a/llmc/compression/quantization/spinquant.py b/llmc/compression/quantization/spinquant.py new file mode 100644 index 000000000..82a1df238 --- /dev/null +++ b/llmc/compression/quantization/spinquant.py @@ -0,0 +1,231 @@ +import gc +from functools import partial + +import torch +import torch.nn as nn +from loguru import logger + +from llmc.utils.registry_factory import ALGO_REGISTRY + +from .base_blockwise_quantization import BaseBlockwiseQuantization +from .hadamard_utils import apply_exact_had_to_linear, random_hadamard_matrix 
+from .module_utils import * +from .module_utils import (_LLMC_LN_TYPES_, _TRANSFORMERS_LN_TYPES_, + EffcientFakeQuantLinear, FakeQuantLinear, + LlmcRMSNorm, OriginEmbedding, OriginFloatLinear, + RotateEmbedding, RotateLinear) +from .rotate_utils import ActRotater, RotateModule, WeightRotater + + +@ALGO_REGISTRY +class SpinQuant(BaseBlockwiseQuantization): + def __init__(self, model, quant_config, input, config): + super().__init__(model, quant_config, input, config) + self.dev = torch.device('cuda') + self.add_quant_config() + self.preprocess() + + def add_quant_config(self): + self.rotate_mode = self.quant_config['special']['rotate_mode'] + self.weight_rotate = True + self.w_rotater = WeightRotater(weight_rotate_func=self.rotate_weight, dev=self.dev) + # self.o_proj_group_quant = self.quant_config['special']['o_proj_group_quant'] + + def preprocess(self): + for m in self.model.model.parameters(): + m.requires_grad = False + + assert self.config['model']['type'] in ['Opt', 'Llama'] + # if self.config["model"]["type"] in ["Opt"]: + self.remove_mean_from_embed() + + Q1 = self.get_orthogonal_matrix(self.hidden_size) + self.model.model.Q1 = RotateModule(Q1) + + self.register_embed_spin_parameters() + + pre_head_ln = self.model.get_pre_head_layernorm_layers()[0] + self.fuse_ln_fcs(pre_head_ln, self.model.get_head_layers()) + + self.model.replace_module_subset( + LlmcRMSNorm, + self.model.model, + {'layers': {'model.norm': pre_head_ln}}, + None, + {}, + ) + self.register_lmhead_spin_parameters() + + gc.collect() + torch.cuda.empty_cache() + + def get_trainable_params(self): + trainable_parameters = [] + for n, m in self.model.model.named_parameters(): + if 'Q1' in n or 'Q2' in n: + trainable_parameters.append(m) + return trainable_parameters + + def a_rot(self, act, module, a_rotater): + return a_rotater.rotate(act) + + def w_rot(self, module, w_rotater, args): + return w_rotater.rotate(module.weight, module.bias, args['Q1'], args['Q2'], args['transpose']) + + def w_qdq_tmp(self, module, wquantizer): + args = {'lowbound_factor': None, 'upbound_factor': None} + if hasattr(module, 'buf_lowbound_factor'): + args['lowbound_factor'] = module.buf_lowbound_factor + if hasattr(module, 'buf_upbound_factor'): + args['upbound_factor'] = module.buf_upbound_factor + + return wquantizer.fake_quant_weight_dynamic(module.tmp_weight, args) + + def register_embed_spin_parameters(self): + embedding_layer = self.model.get_embed_layers()[0] + args = {} + args['Q1'] = self.model.model.Q1 + args['Q2'] = None + args['transpose'] = False + params_dict = self.get_replacement_params(mode='rotate', w_only=self.w_only, name=None, args=args) + params_dict.pop('a_rot') + self.model.replace_module_subset( + RotateEmbedding, + self.model.model, + {'layers': {'model.embed_tokens': embedding_layer}}, + None, + params_dict + ) + self.model.find_embed_layers() + + def register_lmhead_spin_parameters(self): + lm_head_layer = self.model.get_head_layers()[0] + args = {} + args['Q1'] = self.model.model.Q1 + args['Q2'] = None + args['transpose'] = False + params_dict = self.get_replacement_params(mode='rotate', w_only=self.w_only, name=None, args=args) + self.model.replace_module_subset( + RotateLinear, + self.model.model, + {'layers': {'lm_head': lm_head_layer}}, + None, + params_dict + ) + + def apply_fc_rotate_weight(self): + for idx, block in enumerate(self.blocks): + logger.info(f'Start apply {idx}-th block rotate weights') + for name, module in block.named_modules(): + if isinstance(module, (RotateLinear, FakeQuantLinear)): + 
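+                    # Fold the trained rotation into the stored weights so the
+                    # deployed model no longer needs the extra rotation matmul.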
weight, bias = module._rotate_weight() + module.weight, module.bias = weight, bias + logger.info(f'End apply {idx}-th block rotate weights') + + def apply_embedding_rotate_weight(self): + + embedding_layer = self.model.get_embed_layers()[0] + if isinstance(embedding_layer, RotateEmbedding): + weight = embedding_layer._rotate_weight() + embedding_layer.weight.data = weight + self.model.replace_module_subset( + OriginEmbedding, + self.model.model, + {'layers': {'model.embed_tokens': embedding_layer}}, + None, + {} + ) + + def apply_lmhead_rotate_weight(self): + lm_head_layer = self.model.get_head_layers()[0] + if isinstance(lm_head_layer, RotateLinear): + weight, bias = lm_head_layer._rotate_weight() + lm_head_layer.weight, lm_head_layer.bias = weight, bias + self.model.replace_module_subset( + OriginFloatLinear, + self.model.model, + {'layers': {'lm_head': lm_head_layer}}, + None, + {} + ) + + + def get_orthogonal_matrix(self, size): + if self.rotate_mode == 'random': + return random_orthogonal_matrix(size, self.dev) + elif self.rotate_mode == 'hadamard': + return random_hadamard_matrix(size, self.dev) + else: + raise ValueError(f'Unsupported mode {self.mode}') + + def block_transform(self, block): + logger.info(f'Start transform the {self.block_idx+1}-th block') + + subsets = self.model.get_subsets_in_block(block) + for index, subset in enumerate(subsets): + self.subset_transform(block, subset) + + self.model.replace_module_block(LlmcRMSNorm, block, self.block_idx, {}) + + logger.info(f'block:{block}') + logger.info(f'End transform the {self.block_idx+1}-th block') + + def subset_transform(self, block, subset): + prev_op = subset['prev_op'] + layers_dict = subset['layers'] + assert ( + len(prev_op) == 1 + ), 'Only support single prev_op. If multi prev_ops, code need to be updated.' 
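+        # LN/RMSNorm predecessors: fuse the norm into the following linears and
+        # rotate their inputs with the shared Q1; attention subsets additionally
+        # attach a per-head Q2 (used below for v_proj/o_proj) to the inspected module.
+        # Other predecessors fall through to the output-side (transpose=True) rotation.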
+
+        layers = list(layers_dict.values())
+
+        if isinstance(prev_op[0], tuple(_LLMC_LN_TYPES_ + _TRANSFORMERS_LN_TYPES_)):
+            self.fuse_ln_fcs(prev_op[0], layers)
+            for n in layers_dict.keys():
+                m = layers_dict[n]
+                self.replace_rotate_fc(block, n, m, Q1=self.model.model.Q1, Q2=None, transpose=False)
+            if 'is_mlp' not in subset or not subset['is_mlp']:
+                Q2 = self.get_orthogonal_matrix(self.hidden_size // self.num_heads)
+                subset['inspect'].Q2 = RotateModule(Q2)
+
+        else:
+            if self.config['model']['type'] in ['Opt']:
+                self.bake_mean_into_linear(layers[0])
+
+            n = list(layers_dict.keys())[0]
+            m = layers[0]
+            if 'is_mlp' in subset and subset['is_mlp']:
+                if self.online_rotate:
+                    apply_exact_had_to_linear(m, had_dim=-1, output=False)
+                self.replace_rotate_fc(block, n, m, Q1=self.model.model.Q1, Q2=None, transpose=True)
+            else:
+                self.replace_rotate_fc(block, n, m, Q1=self.model.model.Q1, Q2=block.self_attn.Q2, transpose=True)
+                self.replace_rotate_fc(block, 'self_attn.v_proj', prev_op[0], Q1=self.model.model.Q1, Q2=block.self_attn.Q2, transpose=False)
+
+    def apply_rotate_weight(self):
+        self.apply_embedding_rotate_weight()
+        self.apply_lmhead_rotate_weight()
+        self.apply_fc_rotate_weight()
+
+    def deploy(self, quant_format):
+        if quant_format == 'train_rotate_quant':
+            logger.info(f'-- deploy_{quant_format}_model start --')
+            logger.info(f'quant_config : {self.quant_config}')
+            logger.info(self.model.model)
+
+            params_dict = {}
+            params_dict['w_qdq'] = partial(self.w_qdq_tmp, wquantizer=self.wquantizer)
+            params_dict['a_qdq'] = (
+                partial(self.a_qdq, aquantizer=self.aquantizer)
+                if not self.w_only
+                else None
+            )
+            self.model.replace_module_all(
+                FakeQuantLinear, params_dict
+            )
+
+            logger.info(f'-- deploy_{quant_format}_model done --')
+            logger.info('-- start training rotation --')
+        else:
+            self.apply_rotate_weight()
+            super().deploy(quant_format)
diff --git a/llmc/compression/quantization/spqr.py b/llmc/compression/quantization/spqr.py
index 51ee90742..559385669 100644
--- a/llmc/compression/quantization/spqr.py
+++ b/llmc/compression/quantization/spqr.py
@@ -13,13 +13,13 @@
 from .base_blockwise_quantization import BaseBlockwiseQuantization
 from .module_utils import FakeQuantLinear
-from .quant import IntegerQuantizer
+from .quant import Quantizer
 
 
 @ALGO_REGISTRY
 class SpQR(BaseBlockwiseQuantization):
-    def __init__(self, model, quant_config, input, padding_mask, config):
-        super().__init__(model, quant_config, input, padding_mask, config)
+    def __init__(self, model, quant_config, input, config):
+        super().__init__(model, quant_config, input, config)
         assert (
             self.wquantizer.granularity == 'per_group'
         ), 'SpQR only supports per_group quantization'
@@ -50,11 +50,9 @@ def add_quant_config(self):
         scale_config = special_config['scale']
         zero_config = special_config['zero']
 
-        self.quant_type = self.quant_config.get('quant_type', 'int_quant')
-        assert self.quant_type != 'float_quant', 'SPQR do not support Float quant now.'
- self.scale_quantizer = IntegerQuantizer(**scale_config) - self.zero_quantizer = IntegerQuantizer(**zero_config) - self.Q = IntegerQuantizer( + self.scale_quantizer = Quantizer(**scale_config) + self.zero_quantizer = Quantizer(**zero_config) + self.Q = Quantizer( self.wquantizer.bit, self.wquantizer.sym, 'per_channel', round_zp=False ) @@ -232,8 +230,8 @@ def outliers(G, HinvGD): W[:, i].unsqueeze(1), self.qparams['scales'], self.qparams['zeros'], - self.qparams['qmax'], - self.qparams['qmin'], + self.qparams['max_int'], + self.qparams['min_int'], ).squeeze(1) err = (W[:, i] - q) / Hinv[i, i] @@ -323,24 +321,24 @@ def merge_qparams(self, qparams): def get_group_qparams(self, c_tensor, idx): """get qparams for a group, idx is the index of a column within a group, c_tensor is a group.""" - _, s, z, qmax, qmin = self.wquantizer.get_tensor_qparams(c_tensor) + _, s, z, max_int, min_int = self.wquantizer.get_tensor_qparams(c_tensor) _, ss, zs, Ps, Ns = self.scale_quantizer.get_tensor_qparams(s) args = {} args['scales'] = ss args['zeros'] = zs - args['qmin'] = Ns - args['qmax'] = Ps + args['min_int'] = Ns + args['max_int'] = Ps scales = self.scale_quantizer.fake_quant_weight_static(s.data, args) _, sz, zz, Pz, Nz = self.zero_quantizer.get_tensor_qparams(z) args['scales'] = sz args['zeros'] = zz - args['qmin'] = Nz - args['qmax'] = Pz + args['min_int'] = Nz + args['max_int'] = Pz zeros = self.zero_quantizer.fake_quant_weight_static(z.data, args) self.qparams['scales'] = scales self.qparams['zeros'] = zeros - self.qparams['qmax'] = qmax - self.qparams['qmin'] = qmin + self.qparams['max_int'] = max_int + self.qparams['min_int'] = min_int qparams = copy.deepcopy(self.qparams) self.groups[idx // self.wquantizer.group_size] = qparams @@ -351,8 +349,8 @@ def set_model_qparams(self, layer): d['zeros'] = self.merge_qparams([g['zeros'] for g in self.groups]) for k, v in d.items(): layer.register_buffer('buf_' + k, copy.deepcopy(v)) - layer.register_buffer('buf_qmax', torch.tensor(self.groups[0]['qmax'])) - layer.register_buffer('buf_qmin', torch.tensor(self.groups[0]['qmin'])) + layer.register_buffer('buf_max_int', torch.tensor(self.groups[0]['max_int'])) + layer.register_buffer('buf_min_int', torch.tensor(self.groups[0]['min_int'])) @torch.no_grad() def free(self, name): @@ -375,8 +373,8 @@ def w_qdq(self, module, wquantizer): args = {} args['scales'] = module.buf_scales args['zeros'] = module.buf_zeros - args['qmax'] = module.buf_qmax - args['qmin'] = module.buf_qmin + args['max_int'] = module.buf_max_int + args['min_int'] = module.buf_min_int weight = wquantizer.fake_quant_weight_static(weight, args).to(self.model_dtype) diff --git a/llmc/compression/quantization/train_utils.py b/llmc/compression/quantization/train_utils.py index 1f941f1a8..2c69733d5 100644 --- a/llmc/compression/quantization/train_utils.py +++ b/llmc/compression/quantization/train_utils.py @@ -1,11 +1,15 @@ import os +import random import sys import time +from dataclasses import dataclass, field from math import inf import torch import torch.nn as nn +import transformers from loguru import logger +from torch.optim.optimizer import Optimizer class TruncateFunction(torch.autograd.Function): @@ -105,3 +109,186 @@ def ampscaler_get_grad_norm(self, parameters, norm_type=2.0): norm_type, ) return total_norm + + +def unit(v, dim: int = 1, eps: float = 1e-8): + vnorm = norm(v, dim) + return v / vnorm.add(eps), vnorm + + +def norm(v, dim: int = 1): + assert len(v.size()) == 2 + return v.norm(p=2, dim=dim, keepdim=True) + + +def 
matrix_norm_one(W): + out = torch.abs(W) + out = torch.sum(out, dim=0) + out = torch.max(out) + return out + + +def Cayley_loop(X, W, tan_vec, t): # + [n, p] = X.size() + Y = X + t * tan_vec + for i in range(5): + Y = X + t * torch.matmul(W, 0.5 * (X + Y)) + + return Y.t() + + +def qr_retraction(tan_vec): # tan_vec, p-by-n, p <= n + [p, n] = tan_vec.size() + tan_vec.t_() + q, r = torch.linalg.qr(tan_vec) + d = torch.diag(r, 0) + ph = d.sign() + q *= ph.expand_as(q) + q.t_() + + return q + + +class SGDG(Optimizer): + r"""This optimizer updates variables with two different routines + based on the boolean variable 'stiefel'. + + If stiefel is True, the variables will be updated by SGD-G proposed + as decorrelated weight matrix. + + If stiefel is False, the variables will be updated by SGD. + This routine was taken from https://github.com/pytorch/pytorch/blob/master/torch/optim/sgd.py. + + Args: + params (iterable): iterable of parameters to optimize or dicts defining + parameter groups + + -- common parameters + lr (float): learning rate + momentum (float, optional): momentum factor (default: 0) + stiefel (bool, optional): whether to use SGD-G (default: False) + + -- parameters in case stiefel is False + weight_decay (float, optional): weight decay (L2 penalty) (default: 0) + dampening (float, optional): dampening for momentum (default: 0) + nesterov (bool, optional): enables Nesterov momentum (default: False) + + -- parameters in case stiefel is True + omega (float, optional): orthogonality regularization factor (default: 0) + grad_clip (float, optional): threshold for gradient norm clipping (default: None) + """ + + def __init__( + self, + params, + lr, + momentum: int = 0, + dampening: int = 0, + weight_decay: int = 0, + nesterov: bool = False, + stiefel: bool = False, + omega: int = 0, + grad_clip=None, + ) -> None: + defaults = dict( + lr=lr, + momentum=momentum, + dampening=dampening, + weight_decay=weight_decay, + nesterov=nesterov, + stiefel=stiefel, + omega=0, + grad_clip=grad_clip, + ) + if nesterov and (momentum <= 0 or dampening != 0): + raise ValueError('Nesterov momentum requires a momentum and zero dampening') + super(SGDG, self).__init__(params, defaults) + + def __setstate__(self, state) -> None: + super(SGDG, self).__setstate__(state) + for group in self.param_groups: + group.setdefault('nesterov', False) + + def step(self, closure=None, episilon = 1e-8): + """Performs a single optimization step. + + Arguments: + closure (callable, optional): A closure that reevaluates the model + and returns the loss. 
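+
+        For Stiefel parameters, the gradient is combined with the momentum buffer,
+        projected to a skew-symmetric matrix W, and the update is mapped back to
+        the manifold with an iterative Cayley transform (Cayley_loop), plus an
+        occasional QR retraction of the running iterate.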
+ """ + loss = None + if closure is not None: + loss = closure() + + for group in self.param_groups: + momentum = group['momentum'] + stiefel = group['stiefel'] + + for p in group['params']: + if p.grad is None: + continue + + unity, _ = unit(p.data.view(p.size()[0], -1)) + if stiefel and unity.size()[0] <= unity.size()[1]: + weight_decay = group['weight_decay'] + dampening = group['dampening'] + nesterov = group['nesterov'] + + rand_num = random.randint(1, 101) + if rand_num == 1: + unity = qr_retraction(unity) + + g = p.grad.data.view(p.size()[0], -1) + + lr = group['lr'] + + param_state = self.state[p] + if 'momentum_buffer' not in param_state: + param_state['momentum_buffer'] = torch.zeros(g.t().size()) + if p.is_cuda: + param_state['momentum_buffer'] = param_state[ + 'momentum_buffer' + ].cuda() + + V = param_state['momentum_buffer'] + V = momentum * V - g.t() + MX = torch.mm(V, unity) + XMX = torch.mm(unity, MX) + XXMX = torch.mm(unity.t(), XMX) + W_hat = MX - 0.5 * XXMX + W = W_hat - W_hat.t() + t = 0.5 * 2 / (matrix_norm_one(W) + episilon) + alpha = min(t, lr) + + p_new = Cayley_loop(unity.t(), W, V, alpha) + V_new = torch.mm(W, unity.t()) # n-by-p + # check_identity(p_new.t()) + p.data.copy_(p_new.view(p.size())) + V.copy_(V_new) + + else: + d_p = p.grad.data + # defined. + try: + if weight_decay != 0: + # defined. + d_p.add_(weight_decay, p.data) + except: + pass + if momentum != 0: + param_state = self.state[p] + if 'momentum_buffer' not in param_state: + buf = param_state['momentum_buffer'] = d_p.clone() + else: + buf = param_state['momentum_buffer'] + # always defined. + buf.mul_(momentum).add_(1 - dampening, d_p) + # defined. + if nesterov: + d_p = d_p.add(momentum, buf) + else: + d_p = buf + + p.data.add_(-group['lr'], d_p) + + return loss diff --git a/llmc/compression/quantization/utils.py b/llmc/compression/quantization/utils.py index b7975249d..a85f20382 100644 --- a/llmc/compression/quantization/utils.py +++ b/llmc/compression/quantization/utils.py @@ -54,22 +54,3 @@ def check_w_only( ] return quantizer_mix_bits_this_layer['w_only_mix_bits'] return default_w_only - - -def make_divisible(c, divisor): - return (c + divisor - 1) // divisor - - -def calculate_zeros_width(in_features, group_size=128, pack_num=8): - if group_size >= 128: - size_multiplier = 1 - elif group_size == 64: - size_multiplier = 2 - elif group_size == 32: - size_multiplier = 4 - else: - raise NotImplementedError - - base_width = make_divisible(in_features // group_size, pack_num) - base_width = make_divisible(base_width, size_multiplier) * size_multiplier - return base_width diff --git a/llmc/compression/sparsification/base_blockwise_sparsification.py b/llmc/compression/sparsification/base_blockwise_sparsification.py index e62f5f272..607e244fa 100644 --- a/llmc/compression/sparsification/base_blockwise_sparsification.py +++ b/llmc/compression/sparsification/base_blockwise_sparsification.py @@ -12,8 +12,8 @@ class BaseBlockwiseSparsification(BlockwiseOpt): - def __init__(self, model, sparsity_config, input, padding_mask, config): - super().__init__(model, sparsity_config, input, padding_mask, config) + def __init__(self, model, sparsity_config, input, config): + super().__init__(model, sparsity_config, input, config) self.set_sparsity_config() def block_init(self, block): diff --git a/llmc/compression/sparsification/magnitude.py b/llmc/compression/sparsification/magnitude.py index 8f36b295d..57ad23a8b 100644 --- a/llmc/compression/sparsification/magnitude.py +++ 
b/llmc/compression/sparsification/magnitude.py @@ -8,8 +8,8 @@ @ALGO_REGISTRY class Magnitude(BaseBlockwiseSparsification): - def __init__(self, model, sparsity_config, input, padding_mask, config): - super().__init__(model, sparsity_config, input, padding_mask, config) + def __init__(self, model, sparsity_config, input, config): + super().__init__(model, sparsity_config, input, config) @torch.no_grad() def subset_transform( diff --git a/llmc/compression/sparsification/shortgpt.py b/llmc/compression/sparsification/shortgpt.py index c8c8dc410..64aadd9f9 100644 --- a/llmc/compression/sparsification/shortgpt.py +++ b/llmc/compression/sparsification/shortgpt.py @@ -17,8 +17,8 @@ @ALGO_REGISTRY class ShortGPT(BaseBlockwiseSparsification): - def __init__(self, model, sparsity_config, input, padding_mask, config): - super().__init__(model, sparsity_config, input, padding_mask, config) + def __init__(self, model, sparsity_config, input, config): + super().__init__(model, sparsity_config, input, config) def block_opt(self, block): block = block.cuda() diff --git a/llmc/compression/sparsification/wanda.py b/llmc/compression/sparsification/wanda.py index 951e58dab..1cdbc1e76 100644 --- a/llmc/compression/sparsification/wanda.py +++ b/llmc/compression/sparsification/wanda.py @@ -9,12 +9,12 @@ @ALGO_REGISTRY class Wanda(BaseBlockwiseSparsification): - def __init__(self, model, sparsity_config, input, padding_mask, config): - super().__init__(model, sparsity_config, input, padding_mask, config) + def __init__(self, model, sparsity_config, input, config): + super().__init__(model, sparsity_config, input, config) @torch.no_grad() def get_row_scale(self, layer, act): - if len(act.shape) == 2: + if len(act) == 2: act = act.unsqueeze(0) nsamples = act.shape[0] if isinstance(layer, nn.Linear): diff --git a/llmc/data/__init__.py b/llmc/data/__init__.py index 12ec02b05..fd0e40018 100644 --- a/llmc/data/__init__.py +++ b/llmc/data/__init__.py @@ -1,2 +1,2 @@ -from .dataset import BaseDataset +from .dataset import BaseDataset, TrainJsonDataset from .tokenizer import BaseTokenizer diff --git a/llmc/data/dataset/__init__.py b/llmc/data/dataset/__init__.py index b1933afee..bb5057b94 100644 --- a/llmc/data/dataset/__init__.py +++ b/llmc/data/dataset/__init__.py @@ -1 +1,2 @@ from .base_dataset import BaseDataset +from .train_dataset import TrainJsonDataset diff --git a/llmc/data/dataset/base_dataset.py b/llmc/data/dataset/base_dataset.py index 5e48c6ec4..8cfb36a15 100644 --- a/llmc/data/dataset/base_dataset.py +++ b/llmc/data/dataset/base_dataset.py @@ -1,34 +1,25 @@ -import json -import os from abc import ABCMeta import torch from datasets import load_dataset, load_from_disk from loguru import logger -from PIL import Image -from torch.nn import functional as F from .specified_preproc import PREPROC_REGISTRY class BaseDataset(metaclass=ABCMeta): - def __init__(self, tokenizer, calib_cfg, processor=None): + def __init__(self, tokenizer, calib_cfg): # calib_cfg logger.info(f'calib_cfg : {calib_cfg}') self.tokenizer = tokenizer - self.processor = processor self.calib_dataset_name = calib_cfg['name'] - self.calib_dataset_type = calib_cfg.get('type', 'txt') - self.padding = calib_cfg.get('padding', False) self.download = calib_cfg['download'] self.load_from_txt = calib_cfg.get('load_from_txt', False) self.calib_dataset_path = calib_cfg.get('path', None) self.n_samples = calib_cfg['n_samples'] self.calib_bs = calib_cfg['bs'] - self.seq_len = calib_cfg.get('seq_len', None) + self.seq_len = calib_cfg['seq_len'] self.preproc 
= calib_cfg['preproc'] - if self.preproc == 'original_txt': - assert self.seq_len is None self.seed = calib_cfg['seed'] self.dataset_key = { 'pileval': 'text', @@ -41,220 +32,61 @@ def __init__(self, tokenizer, calib_cfg, processor=None): self.build_calib_dataset() def build_calib_dataset(self): - if self.calib_dataset_type == 'txt': - if self.download: - if self.calib_dataset_name == 'pileval': - self.calib_dataset = load_dataset( - 'mit-han-lab/pile-val-backup', split='validation' - ) - elif self.calib_dataset_name == 'c4': - self.calib_dataset = load_dataset( - 'allenai/c4', - data_files={'train': 'en/c4-train.00000-of-01024.json.gz'}, - split='train', - ) - elif self.calib_dataset_name == 'wikitext2': - self.calib_dataset = load_dataset( - 'wikitext', 'wikitext-2-raw-v1', split='train' - ) - elif self.calib_dataset_name == 'ptb': - self.calib_dataset = load_dataset( - 'ptb_text_only', 'penn_treebank', split='train' - ) - else: - raise Exception(f'Not support {self.calib_dataset_name} dataset.') + if self.download: + if self.calib_dataset_name == 'pileval': + self.calib_dataset = load_dataset( + 'mit-han-lab/pile-val-backup', split='validation' + ) + elif self.calib_dataset_name == 'c4': + self.calib_dataset = load_dataset( + 'allenai/c4', + data_files={'train': 'en/c4-train.00000-of-01024.json.gz'}, + split='train', + ) + elif self.calib_dataset_name == 'wikitext2': + self.calib_dataset = load_dataset( + 'wikitext', 'wikitext-2-raw-v1', split='train' + ) + elif self.calib_dataset_name == 'ptb': + self.calib_dataset = load_dataset( + 'ptb_text_only', 'penn_treebank', split='train' + ) else: - if not self.load_from_txt: - # Need to pre-download the dataset. - self.calib_dataset = load_from_disk(self.calib_dataset_path) - else: - """Load dataset from your custom txt file. - - Each line in the txt file represents one input text data. 
- """ - assert self.calib_dataset_path.endswith('.txt') - logger.info(f'calib_dataset_path: {self.calib_dataset_path}') - with open(self.calib_dataset_path, 'r') as fp: - lines = fp.readlines() - self.calib_dataset = [] - for line in lines: - self.calib_dataset.append(line.strip()) - elif self.calib_dataset_type == 'img_txt': - self.calib_dataset = [] - logger.info(f'calib_dataset_path: {self.calib_dataset_path}') - for root, _, files in os.walk(self.calib_dataset_path): - for name in files: - if name.endswith('.jpg') or name.endswith('.png'): - img_path = os.path.join(root, name) - qa_path = os.path.join(root, name.split('.')[0] + '.json') - try: - with open(qa_path, 'r') as json_file: - data = json.load(json_file) - for qa in data: - question = qa['question'] - gt_answer = qa['answer'] - prompt = ( - f'USER: \n{question}\nASSISTANT: {gt_answer}' - ) - raw_image = Image.open(img_path) - self.calib_dataset.append((prompt, raw_image)) - except FileNotFoundError: - logger.warning(f'QA file not found for image: {img_path}') - except Exception as e: - logger.error( - f'Error processing image {img_path} and' - f'QA file {qa_path}: {e}' - ) - elif self.calib_dataset_type == 'img': - self.calib_dataset = [] - logger.info(f'calib_dataset_path: {self.calib_dataset_path}') - for root, _, files in os.walk(self.calib_dataset_path): - for name in files: - if name.endswith(('.jpg', '.png', '.JPEG')): - img_path = os.path.join(root, name) - raw_image = Image.open(img_path).convert('RGB') - self.calib_dataset.append(raw_image) - if len(self.calib_dataset) == self.n_samples: - return + raise Exception(f'Not support {self.calib_dataset_name} dataset.') else: - raise ValueError(f'Unsupported data type: {self.calib_dataset_type}') + if not self.load_from_txt: + # Need to pre-download the dataset. + self.calib_dataset = load_from_disk(self.calib_dataset_path) + else: + """Load dataset from your custom txt file. + + Each line in the txt file represents one input text data. 
+ """ + assert self.calib_dataset_path.endswith('.txt') + logger.info(f'calib_dataset_path: {self.calib_dataset_path}') + with open(self.calib_dataset_path, 'r') as fp: + lines = fp.readlines() + self.calib_dataset = [] + for line in lines: + self.calib_dataset.append(line.strip()) def get_calib_samples(self): if self.preproc == 'general': samples = self.general_preproc( self.calib_dataset, self.tokenizer, self.n_samples, self.seq_len ) - elif self.preproc.startswith(('vlm_', 'img_')): - preproc = PREPROC_REGISTRY[self.preproc] - samples = preproc(self.calib_dataset, self.processor, self.n_samples) else: preproc = PREPROC_REGISTRY[self.preproc] samples = preproc( - self.calib_dataset, self.tokenizer, - self.n_samples, self.seq_len + self.calib_dataset, self.tokenizer, self.n_samples, self.seq_len ) return samples - def txt_group_samples_with_mask(self, samples): - calib_samples = [] - input_ids = [] - attention_mask = [] - pad_token_id = self.tokenizer.convert_tokens_to_ids(self.tokenizer.eos_token) - if self.calib_bs < 0: - samples_len = [sample.shape[-1] for sample in samples] - max_len = max(samples_len) - samples_tmp = [] - attention_mask_tmp = [] - for sample in samples: - samples_tmp.append( - F.pad(sample, [0, max_len - sample.shape[-1]], value=pad_token_id) - ) - attention_mask_tmp.append( - F.pad( - torch.ones(1, sample.shape[-1], dtype=torch.int64), - [0, max_len - sample.shape[-1]], - value=0 - ) - ) - batch_input_ids = torch.cat(samples_tmp, dim=0) - batch_attention_mask = torch.cat(attention_mask_tmp, dim=0) - calib_samples.append( - {'input_ids': batch_input_ids, 'attention_mask': batch_attention_mask} - ) - elif self.calib_bs == 1: - input_ids = samples - attention_mask = [torch.ones(1, sample.shape[-1], dtype=torch.int64) for sample in samples] # noqa - for i in range(len(samples)): - calib_samples.append( - {'input_ids': input_ids[i], 'attention_mask': attention_mask[i]} - ) - elif self.calib_bs > 1: - for i in range(0, len(samples), self.calib_bs): - start = i - end = min(i + self.calib_bs, len(samples)) - batch_samples = samples[start:end] - batch_samples_len = [sample.shape[-1] for sample in batch_samples] - batch_max_len = max(batch_samples_len) - samples_tmp = [] - attention_mask_tmp = [] - for sample in batch_samples: - samples_tmp.append( - F.pad( - sample, - [0, batch_max_len - sample.shape[-1]], - value=pad_token_id - ) - ) - attention_mask_tmp.append( - F.pad( - torch.ones(1, sample.shape[-1], dtype=torch.int64), - [0, batch_max_len - sample.shape[-1]], - value=0 - ) - ) - batch_input_ids = torch.cat(samples_tmp, dim=0) - batch_attention_mask = torch.cat(attention_mask_tmp, dim=0) - calib_samples.append( - { - 'input_ids': batch_input_ids, - 'attention_mask': batch_attention_mask - } - ) - return calib_samples - - def txt_group_samples_wo_mask(self, samples): # without mask + def get_calib_dataset(self): + samples = self.get_calib_samples() calib_samples = [] if self.calib_bs < 0: batch = torch.cat(samples, dim=0) - calib_samples.append({'input_ids': batch}) - elif self.calib_bs == 1: - for i in range(len(samples)): - calib_samples.append({'input_ids': samples[i]}) - elif self.calib_bs > 1: - for i in range(0, len(samples), self.calib_bs): - start = i - end = min(i + self.calib_bs, len(samples)) - batch = samples[start:end] - batch = torch.cat(batch, dim=0) - calib_samples.append({'input_ids': batch}) - return calib_samples - - def img_txt_group_samples_wo_mask(self, samples): # without mask - calib_samples = [] - if self.calib_bs < 0: - batch = self.processor( - 
text=samples['prompts'], - images=samples['raw_images'], - return_tensors='pt', - padding=True - ) - calib_samples.append(batch) - elif self.calib_bs == 1: - for prompt, raw_image in zip(samples['prompts'], samples['raw_images']): - batch = self.processor( - text=prompt, - images=raw_image, - return_tensors='pt' - ) - calib_samples.append(batch) - elif self.calib_bs > 1: - for i in range(0, len(samples['prompts']), self.calib_bs): - start = i - end = min(i + self.calib_bs, len(samples['prompts'])) - batch = self.processor( - text=samples['prompts'][start:end], - images=samples['raw_images'][start:end], - return_tensors='pt', - padding=True - ) - calib_samples.append(batch) - return calib_samples - - def img_group_samples_wo_mask(self, samples): # without mask - calib_samples = [] - if self.calib_bs < 0: - batch = {'pixel_values': torch.cat([sample['pixel_values'] - for sample in samples], dim=0)} calib_samples.append(batch) elif self.calib_bs == 1: calib_samples = samples @@ -263,45 +95,10 @@ def img_group_samples_wo_mask(self, samples): # without mask start = i end = min(i + self.calib_bs, len(samples)) batch = samples[start:end] - batch = {'pixel_values': torch.cat([sample['pixel_values'] - for sample in batch], dim=0)} + batch = torch.cat(batch, dim=0) calib_samples.append(batch) - return calib_samples - - def get_calib_dataset(self): - samples = self.get_calib_samples() - if self.calib_dataset_type in ['txt', 'img']: - logger.info(f'len(samples) all : {len(samples)}') - assert len(samples) % int(os.environ['WORLD_SIZE']) == 0 - samples = samples[int(os.environ['RANK'])::int(os.environ['WORLD_SIZE'])] - logger.info(f'len(samples) rank : {len(samples)}') - elif self.calib_dataset_type == 'img_txt': - samples_len = len(samples['prompts']) - logger.info(f'len(samples) all : {samples_len}') - assert samples_len % int(os.environ['WORLD_SIZE']) == 0 - rank = int(os.environ['RANK']) - world_size = int(os.environ['WORLD_SIZE']) - samples = { - 'prompts': samples['prompts'][rank::world_size], - 'raw_images': samples['raw_images'][rank::world_size] - } - logger.info(f'len(samples) rank : {samples_len}') - calib_samples = [] - if self.calib_dataset_type == 'txt': - if self.padding: - calib_samples = self.txt_group_samples_with_mask(samples) - else: - calib_samples = self.txt_group_samples_wo_mask(samples) - elif self.calib_dataset_type == 'img': - calib_samples = self.img_group_samples_wo_mask(samples) - elif self.calib_dataset_type == 'img_txt': - calib_samples = self.img_txt_group_samples_wo_mask(samples) logger.info(f'len(calib_samples) : {len(calib_samples)}') - if self.padding: - padding_mask = [calib_sample['attention_mask'] for calib_sample in calib_samples] # noqa - else: - padding_mask = None - return calib_samples, padding_mask + return calib_samples def general_preproc(self, calib_dataset, tokenizer, n_samples, seq_len): dataset = calib_dataset.shuffle(seed=self.seed) diff --git a/llmc/data/dataset/specified_preproc.py b/llmc/data/dataset/specified_preproc.py index ab7372a09..acbf51a76 100644 --- a/llmc/data/dataset/specified_preproc.py +++ b/llmc/data/dataset/specified_preproc.py @@ -96,81 +96,7 @@ def pileval_omni(calib_dataset, tokenizer, n_samples, seq_len): j = i + seq_len inp = trainenc.input_ids[:, i:j] samples.append(inp) - return samples - - -@PREPROC_REGISTRY -def vlm_native(calib_dataset, processor, n_samples): - random.shuffle(calib_dataset) - samples = { - 'prompts': [], - 'raw_images': [] - } - n_run = 0 - for data in calib_dataset: - prompt, raw_image = data - 
samples['prompts'].append(prompt) - samples['raw_images'].append(raw_image) - n_run += 1 - if n_run == n_samples: - break - return samples - - -@PREPROC_REGISTRY -def vlm_divide_equal(calib_dataset, processor, n_samples): - samples_native = vlm_native(calib_dataset, processor, n_samples) - inputs = processor('\n\n'.join(samples_native['prompts']), return_tensors='pt') - samples = { - 'prompts': [], - 'raw_images': [] - } - total_len = inputs.input_ids.shape[1] - seq_len = total_len // n_samples - for i in range(n_samples): - s = i * seq_len - e = (i + 1) * seq_len - token_ids = inputs.input_ids[:, s:e] - prompt = processor.decode(token_ids.squeeze(), skip_special_tokens=True) - prompt = prompt.replace('USER:', 'USER: ') - samples['prompts'].append(prompt) - samples['raw_images'].append(samples_native['raw_images'][i]) - return samples - - -@PREPROC_REGISTRY -def vlm_clip_min(calib_dataset, processor, n_samples): - samples_native = vlm_native(calib_dataset, processor, n_samples) - samples = { - 'prompts': [], - 'raw_images': [] - } - trainenc = [ - processor(prompt, return_tensors='pt') - for prompt in samples_native['prompts'] - ] - min_len = min(enc.input_ids.shape[1] for enc in trainenc) - for i in range(n_samples): - token_ids = trainenc[i].input_ids[:, :min_len] - prompt = processor.decode(token_ids.squeeze(), skip_special_tokens=True) - prompt = prompt.replace('USER:', 'USER: ') - samples['prompts'].append(prompt) - samples['raw_images'].append(samples_native['raw_images'][i]) - return samples - - -@PREPROC_REGISTRY -def img_sampler(calib_dataset, processor, n_samples): - random.shuffle(calib_dataset) - samples = [] - n_run = 0 - for image in calib_dataset: - inp = processor(images=image, return_tensors='pt') - samples.append(inp) - n_run += 1 - if n_run == n_samples: - break - return samples + return samples, None @PREPROC_REGISTRY @@ -184,15 +110,3 @@ def random_truncate_txt(calib_dataset, tokenizer, n_samples, seq_len): inp = trainenc.input_ids[:, i:j] samples.append(inp) return samples - - -@PREPROC_REGISTRY -def original_txt(calib_dataset, tokenizer, n_samples, seq_len=None): - random.shuffle(calib_dataset) - n_samples = min(n_samples, len(calib_dataset)) - samples = [] - for i in range(n_samples): - trainenc = tokenizer(calib_dataset[i], return_tensors='pt') - inp = trainenc.input_ids - samples.append(inp) - return samples diff --git a/llmc/data/dataset/train_dataset.py b/llmc/data/dataset/train_dataset.py new file mode 100644 index 000000000..a1f03cfed --- /dev/null +++ b/llmc/data/dataset/train_dataset.py @@ -0,0 +1,62 @@ +import torch +from loguru import logger + + +class TrainJsonDataset(torch.utils.data.IterableDataset): + def __init__(self, dataset, tokenizer, block_size) -> None: + raw_data = dataset + self.tokenizer = tokenizer + self.block_size = block_size + tokenized_datasets = [] + for d in raw_data: + tokenized_datasets.append(self.tokenize_function(d)) + + grouped_dataset = self.group_texts(tokenized_datasets) + self.input_ids = grouped_dataset['input_ids'] + self.labels = grouped_dataset['labels'] + self.data = [ + dict(input_ids=self.input_ids[i], labels=self.labels[i]) + for i in range(len(self.input_ids)) + ] + + def __len__(self): + return len(self.data) + + def __getitem__(self, i): + return dict(input_ids=self.input_ids[i], labels=self.labels[i]) + + def __iter__(self): + return iter(self.data) + + def tokenize_function(self, examples): + return self.tokenizer(examples['text']) + + def group_texts(self, examples): + # Concatenate all texts. 
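+        # For example, with block_size=4, tokenized samples [1, 2, 3] and [4, 5, 6, 7, 8, 9] are concatenated to [1, 2, 3, 4, 5, 6, 7, 8, 9] and split into the chunks [1, 2, 3, 4] and [5, 6, 7, 8]; the leftover token [9] is dropped.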
+ # Initialize an empty dictionary + concatenated_examples = {} + + # Loop through the list of dictionaries + for d in examples: + # Loop through the keys in each dictionary + for key in d.keys(): + # If the key is not already a key in the dict_of_lists, create a new list + if key not in concatenated_examples: + concatenated_examples[key] = [] + # Append the value to the list associated with the key in dict_of_lists + concatenated_examples[key].extend(d[key]) + total_length = len(concatenated_examples['input_ids']) + # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can + # customize this part to your needs. + if total_length >= self.block_size: + total_length = (total_length // self.block_size) * self.block_size + # Split by chunks of max_len. + result = { + k: [ + t[i : i + self.block_size] + for i in range(0, total_length, self.block_size) + ] + for k, t in concatenated_examples.items() + } + result['labels'] = result['input_ids'].copy() + return result diff --git a/llmc/data/tokenizer/base_tokenizer.py b/llmc/data/tokenizer/base_tokenizer.py index f179fdbb7..c7d7429d2 100644 --- a/llmc/data/tokenizer/base_tokenizer.py +++ b/llmc/data/tokenizer/base_tokenizer.py @@ -1,4 +1,3 @@ -import warnings from abc import ABCMeta from transformers import AutoTokenizer @@ -18,13 +17,9 @@ def __str__(self): return str(self.tokenizer) def build_tokenizer(self): - try: - self.tokenizer = AutoTokenizer.from_pretrained( - self.tokenizer_path, use_fast=self.use_fast, trust_remote_code=True - ) - except Exception as e: - self.tokenizer = None - warnings.warn(f'Failed to load tokenizer. Error: {str(e)}') + self.tokenizer = AutoTokenizer.from_pretrained( + self.tokenizer_path, use_fast=self.use_fast, trust_remote_code=True + ) def get_tokenizer(self): return self.tokenizer diff --git a/llmc/eval/__init__.py b/llmc/eval/__init__.py index 7fd4c3b60..88563ee5b 100644 --- a/llmc/eval/__init__.py +++ b/llmc/eval/__init__.py @@ -1,3 +1 @@ -from .eval_acc import AccuracyEval from .eval_ppl import PerplexityEval -from .eval_token_consist import TokenConsistencyEval diff --git a/llmc/eval/eval_ppl.py b/llmc/eval/eval_ppl.py index 9b07e5b45..c925acdb8 100644 --- a/llmc/eval/eval_ppl.py +++ b/llmc/eval/eval_ppl.py @@ -6,13 +6,98 @@ from datasets import load_dataset, load_from_disk from loguru import logger -from .eval_base import BaseEval +class PerplexityEval: + def __init__(self, tokenizer, eval_cfg): + self.tokenizer = tokenizer + # eval_cfg + logger.info(f'eval_cfg : {eval_cfg}') + self.dataset = eval_cfg['name'] + assert self.dataset in [ + 'wikitext2', + 'c4', + 'ptb', + ], 'Ppl eval only support wikitext2, c4, ptb dataset now.' + self.seq_len = eval_cfg['seq_len'] + self.bs = eval_cfg['bs'] + self.path = eval_cfg.get('path', None) + self.download = eval_cfg['download'] + self.inference_per_block = eval_cfg.get('inference_per_block', False) + self.testenc = self.build_data() -class PerplexityEval(BaseEval): + @torch.no_grad() + def build_data(self): + # load data + if self.download: + if self.dataset == 'wikitext2': + testdata = load_dataset('wikitext', 'wikitext-2-raw-v1', split='test') + elif self.dataset == 'c4': + testdata = load_dataset( + 'allenai/c4', + data_files={ + 'validation': 'en/c4-validation.00000-of-00008.json.gz' + }, + split='validation', + ) + elif self.dataset == 'ptb': + testdata = load_dataset('ptb_text_only', 'penn_treebank', split='test') + else: + assert self.path, 'Please set path in eval_cfg.' 
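+                # 'path' must point to a dataset previously saved locally with datasets' save_to_disk().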
+ testdata = load_from_disk(self.path) + + # encode data + if self.dataset == 'wikitext2': + testenc = self.tokenizer('\n\n'.join(testdata['text']), return_tensors='pt') + elif self.dataset == 'c4': + testenc = self.tokenizer( + ' '.join(testdata[:1100]['text']), return_tensors='pt' + ) + testenc.input_ids = testenc.input_ids[:, : (256 * self.seq_len)] + elif self.dataset == 'ptb': + testenc = self.tokenizer( + ' '.join(testdata['sentence']), return_tensors='pt' + ) + return testenc + + @torch.no_grad() + def eval(self, model_llmc): + model = model_llmc.get_model() + if self.inference_per_block: + handles = [] + for layer in model_llmc.get_blocks(): + handles.append(layer.register_forward_pre_hook(self.forward_pre_hook)) + for layer in model_llmc.get_blocks(): + handles.append(layer.register_forward_hook(self.forward_hook)) + for layer in model_llmc.get_layers_except_blocks(): + layer.cuda() + else: + model.cuda() + + model.eval() + ppl = self.eval_ppl_func(model, self.testenc, self.seq_len, self.bs) + if self.inference_per_block: + for h in handles: + h.remove() + model.cpu() + gc.collect() + torch.cuda.empty_cache() + return ppl + + @torch.no_grad() + def forward_pre_hook(self, m, x): + m.cuda() + + @torch.no_grad() + def forward_hook(self, m, x, y): + with ThreadPoolExecutor() as executor: + executor.submit(self.load_layer_to_cpu, m) + + @torch.no_grad() + def load_layer_to_cpu(self, m): + m.cpu() @torch.no_grad() - def eval_func(self, org_model, model, testenc, seq_len, bs): + def eval_ppl_func(self, model, testenc, seq_len, bs): testenc = testenc.input_ids nsamples = testenc.numel() // seq_len @@ -73,7 +158,7 @@ def eval_func(self, org_model, model, testenc, seq_len, bs): parser.add_argument('--model_path', type=str, required=True) args = parser.parse_args() - tokenizer = BaseTokenizer(args.model_path, tokenizer_mode='fast') + tokenizer = BaseTokenizer(args.model_path) model = MODEL_REGISTRY[args.model_type](args.model_path, 'auto') # Llama2-70B config example diff --git a/llmc/eval/eval_token.py b/llmc/eval/eval_token.py new file mode 100644 index 000000000..21ead8cc7 --- /dev/null +++ b/llmc/eval/eval_token.py @@ -0,0 +1,185 @@ +import gc +from concurrent.futures import ThreadPoolExecutor + +import torch +import torch.nn as nn +from datasets import load_dataset, load_from_disk +from loguru import logger + + +class TokenConsistencyEval: + def __init__(self, tokenizer, eval_cfg): + self.tokenizer = tokenizer + # eval_cfg + logger.info(f'eval_cfg : {eval_cfg}') + self.dataset = eval_cfg['name'] + assert self.dataset in [ + 'wikitext2', + 'c4', + 'ptb', + ], 'Token consistency eval only supports wikitext2, c4, ptb datasets now.' + self.seq_len = eval_cfg['seq_len'] + self.bs = eval_cfg['bs'] + self.path = eval_cfg.get('path', None) + self.download = eval_cfg['download'] + self.inference_per_block = eval_cfg.get('inference_per_block', False) + self.testenc = self.build_data() + + @torch.no_grad() + def build_data(self): + # load data + if self.download: + if self.dataset == 'wikitext2': + testdata = load_dataset('wikitext', 'wikitext-2-raw-v1', split='test') + elif self.dataset == 'c4': + testdata = load_dataset( + 'allenai/c4', + data_files={ + 'validation': 'en/c4-validation.00000-of-00008.json.gz' + }, + split='validation', + ) + elif self.dataset == 'ptb': + testdata = load_dataset('ptb_text_only', 'penn_treebank', split='test') + else: + assert self.path, 'Please set path in eval_cfg.' 
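+                # As in PerplexityEval, 'path' must point to a dataset saved locally with datasets' save_to_disk().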
+ testdata = load_from_disk(self.path) + + # encode data + if self.dataset == 'wikitext2': + testenc = self.tokenizer('\n\n'.join(testdata['text']), return_tensors='pt') + elif self.dataset == 'c4': + testenc = self.tokenizer( + ' '.join(testdata[:1100]['text']), return_tensors='pt' + ) + testenc.input_ids = testenc.input_ids[:, : (256 * self.seq_len)] + elif self.dataset == 'ptb': + testenc = self.tokenizer( + ' '.join(testdata['sentence']), return_tensors='pt' + ) + return testenc + + @torch.no_grad() + def eval(self, model_llmc_1, model_llmc_2): + model1 = model_llmc_1.get_model() + model2 = model_llmc_2.get_model() + + if self.inference_per_block: + handles1 = [] + handles2 = [] + for layer in model_llmc_1.get_blocks(): + handles1.append(layer.register_forward_pre_hook(self.forward_pre_hook)) + handles1.append(layer.register_forward_hook(self.forward_hook)) + for layer in model_llmc_2.get_blocks(): + handles2.append(layer.register_forward_pre_hook(self.forward_pre_hook)) + handles2.append(layer.register_forward_hook(self.forward_hook)) + for layer in model_llmc_1.get_layers_except_blocks(): + layer.cuda() + for layer in model_llmc_2.get_layers_except_blocks(): + layer.cuda() + else: + model1.cuda() + model2.cuda() + + model1.eval() + model2.eval() + + consistency = self.eval_token_consistency(model1, model2, self.testenc, self.seq_len, self.bs) + + if self.inference_per_block: + for h in handles1 + handles2: + h.remove() + + model1.cpu() + model2.cpu() + gc.collect() + torch.cuda.empty_cache() + return consistency + + @torch.no_grad() + def forward_pre_hook(self, m, x): + m.cuda() + + @torch.no_grad() + def forward_hook(self, m, x, y): + with ThreadPoolExecutor() as executor: + executor.submit(self.load_layer_to_cpu, m) + + @torch.no_grad() + def load_layer_to_cpu(self, m): + m.cpu() + + @torch.no_grad() + def eval_token_consistency(self, model1, model2, testenc, seq_len, bs): + testenc = testenc.input_ids + nsamples = testenc.numel() // seq_len + + consistent_tokens = 0 + total_tokens = 0 + + # Loop through each batch + for i in range(0, nsamples, bs): + logger.info(f'index : {(i + 1) // bs}/{nsamples // bs}') + # Calculate end index + j = min(i + bs, nsamples) + + # Prepare inputs and move to gpu + inputs = testenc[:, (i * seq_len): (j * seq_len)].cuda() + inputs = inputs.reshape(j - i, seq_len) + + # Forward pass through the models + logits1 = model1(inputs).logits + logits2 = model2(inputs).logits + + # Get predicted tokens + preds1 = torch.argmax(logits1, dim=-1) + preds2 = torch.argmax(logits2, dim=-1) + + # Compare tokens for consistency + consistent_tokens += (preds1 == preds2).sum().item() + total_tokens += preds1.numel() + + # Calculate consistency ratio + consistency_ratio = consistent_tokens / total_tokens + + # Empty CUDA cache to save memory + testenc.cpu() + torch.cuda.empty_cache() + + return consistency_ratio + + +if __name__ == '__main__': + import sys + + sys.path.append('../../') + import argparse + + from llmc.data import BaseTokenizer + from llmc.models import Llama + from llmc.utils.registry_factory import MODEL_REGISTRY + + parser = argparse.ArgumentParser() + parser.add_argument('--model_type_1', type=str, required=True) + parser.add_argument('--model_path_1', type=str, required=True) + parser.add_argument('--model_type_2', type=str, required=True) + parser.add_argument('--model_path_2', type=str, required=True) + args = parser.parse_args() + + tokenizer = BaseTokenizer(args.model_path_1, tokenizer_mode='slow') + model1 = 
MODEL_REGISTRY[args.model_type_1](args.model_path_1, 'auto') + model2 = MODEL_REGISTRY[args.model_type_2](args.model_path_2, 'auto') + + # Llama2-70B config example + eval_cfg = { + 'name': 'wikitext2', + 'seq_len': 2048, + 'bs': 1, + 'download': False, + 'path': '/home/gushiqiao/nvme/gushiqiao/llm_datasets/eval/wikitext2', + 'inference_per_block': False, + } + token_consistency_eval = TokenConsistencyEval(tokenizer.get_tokenizer(), eval_cfg) + + consistency_ratio = token_consistency_eval.eval(model1, model2) + logger.info(f'Token consistency ratio: {consistency_ratio}') diff --git a/llmc/models/__init__.py b/llmc/models/__init__.py index 9e498f1e3..0fbf1d53f 100644 --- a/llmc/models/__init__.py +++ b/llmc/models/__init__.py @@ -1,22 +1,11 @@ from .bloom import Bloom -from .deepseekv2 import DeepseekV2 from .falcon import Falcon from .gemma2 import Gemma2 from .internlm2 import InternLM2 -from .internomni import InternOmni -from .internvl2 import InternVL2 from .llama import Llama from .llava import Llava -from .minicpm import MiniCPM from .mistral import Mistral from .mixtral import Mixtral from .opt import Opt -from .phi import Phi -from .qwen import Qwen from .qwen2 import Qwen2 -from .qwen2moe import Qwen2Moe -from .qwenvl import QwenVL -from .smollm import SmolLM -from .stablelm import StableLm from .starcoder import Starcoder -from .vit import Vit diff --git a/llmc/models/base_model.py b/llmc/models/base_model.py index 1695043a9..b3a34be42 100644 --- a/llmc/models/base_model.py +++ b/llmc/models/base_model.py @@ -17,19 +17,14 @@ class BaseModel(metaclass=ABCMeta): - def __init__(self, model_path, torch_dtype, device_map=None, use_cache=False): + def __init__(self, model_path, torch_dtype): self.model_path = model_path self.torch_dtype = torch_dtype if torch_dtype == 'auto' else eval(torch_dtype) - self.device_map = device_map - self.use_cache = use_cache - self.vlm_model = None - self.processor = None self.build_model() self.model.eval() self.find_blocks() self.find_embed_layers() self.find_block_name() - self.add_layernorms_class() @abstractmethod def find_blocks(self): @@ -60,17 +55,10 @@ def get_layers_except_blocks(self): def get_subsets_in_block(self, block): pass - @abstractmethod - def skip_layer_name(self): - pass - @abstractmethod def has_bias(self): pass - def get_attention_rotary_layers(self): - return [] - def __str__(self): return f'\nConfig: \n{str(self.model_config)} \nModel: \n{str(self.model)}' @@ -78,34 +66,19 @@ def build_model(self): self.model_config = AutoConfig.from_pretrained( self.model_path, trust_remote_code=True ) - if not self.use_cache: - if hasattr(self.model_config, 'use_cache'): - self.model_config.use_cache = False + if hasattr(self.model_config, 'use_cache'): + self.model_config.use_cache = False logger.info(f'self.model_config : {self.model_config}') self.model = AutoModelForCausalLM.from_pretrained( self.model_path, config=self.model_config, - device_map=self.device_map, trust_remote_code=True, torch_dtype=self.torch_dtype, low_cpu_mem_usage=True, ) - def add_layernorms_class(self): - ln_class_list = [] - single_block = self.get_blocks()[0] - ln_dict = self.get_layernorms_in_block(single_block) - for ln_name in ln_dict: - ln_class = ln_dict[ln_name].__class__ - if ln_class not in ln_class_list: - ln_class_list.append(ln_class) - for ln_class in ln_class_list: - if ln_class not in _TRANSFORMERS_LN_TYPES_: - _TRANSFORMERS_LN_TYPES_.append(ln_class) - logger.info(f'_TRANSFORMERS_LN_TYPES_ : {_TRANSFORMERS_LN_TYPES_}') - @torch.no_grad() - def 
collect_first_block_input(self, calib_data, data_type='txt'): + def collect_first_block_input(self, calib_data): first_block_input = defaultdict(list) class Catcher(nn.Module): @@ -122,38 +95,15 @@ def forward(self, inp, **kwargs): raise ValueError self.move_embed_to_device('cuda') - if data_type == 'img_txt': - self.vision_tower = self.vision_tower.to('cuda') - self.multi_modal_projector = self.multi_modal_projector.to('cuda') self.blocks[0] = self.blocks[0].cuda() self.blocks[0] = Catcher(self.blocks[0]) for data in calib_data: try: - if data_type == 'txt': - data = { - k: v.to(next(self.model.parameters()).device) - for k, v in data.items() - } - self.model(**data) - elif data_type == 'img': - data = { - k: v.to(next(self.model.parameters()).device) - for k, v in data.items() - } - self.model(**data) - elif data_type == 'img_txt': - data = { - k: v.to(next(self.model.parameters()).device) - for k, v in data.items() - } - self.vlm_model.generate(**data, max_new_tokens=200, do_sample=False) + self.model(data.to(next(self.model.parameters()).device)) except ValueError: pass self.first_block_input = first_block_input - if data_type == 'img_txt': - self.vision_tower = self.vision_tower.cpu() - self.multi_modal_projector = self.multi_modal_projector.cpu() self.blocks[0] = self.blocks[0].module self.blocks[0] = self.blocks[0].cpu() self.move_embed_to_device('cpu') @@ -166,9 +116,7 @@ def get_model_config(self): def move_embed_to_device(self, device): for embed_layer in self.get_embed_layers(): - embed_layer.to(device) - for attention_rotary_layer in self.get_attention_rotary_layers(): - attention_rotary_layer.to(device) + embed_layer = embed_layer.to(device) def get_block_linears(self, block): return { @@ -177,9 +125,6 @@ def get_block_linears(self, block): if isinstance(m, tuple(_LLMC_LINEAR_TYPES_ + _TRANSFORMERS_LINEAR_TYPES_)) } - def get_extra_modules(self, block): - return {} - def set_mix_bits_params_dict(self, block_idx, name, params_dict): logger.info('set_mix_bits_params_dict') @@ -241,16 +186,13 @@ def set_mix_bits_params_dict(self, block_idx, name, params_dict): params_mix_dict['a_qdq'] = None return params_mix_dict - def replace_module_all(self, module, params_dict, keep_device=False): + def replace_module_all(self, module, params_dict): for block_idx in range(len(self.blocks)): logger.info(f'Replace block index: {block_idx}/{len(self.blocks)}') block = self.blocks[block_idx] - if keep_device: - self.replace_module_block(module, block, block_idx, params_dict) - else: - block = block.cuda() - self.replace_module_block(module, block, block_idx, params_dict) - block = block.cpu() + block = block.cuda() + self.replace_module_block(module, block, block_idx, params_dict) + block = block.cpu() gc.collect() torch.cuda.empty_cache() diff --git a/llmc/models/bloom.py b/llmc/models/bloom.py index 16980a87c..34b0e9eae 100644 --- a/llmc/models/bloom.py +++ b/llmc/models/bloom.py @@ -5,8 +5,8 @@ @MODEL_REGISTRY class Bloom(BaseModel): - def __init__(self, model_path, torch_dtype, device_map=None, use_cache=False): - super().__init__(model_path, torch_dtype, device_map, use_cache) + def __init__(self, model_path, torch_dtype): + super().__init__(model_path, torch_dtype) def find_blocks(self): self.blocks = self.model.transformer.h @@ -31,9 +31,6 @@ def get_layers_except_blocks(self): self.model.transformer.ln_f, ] - def skip_layer_name(self): - return ['lm_head'] - def has_bias(self): return True diff --git a/llmc/models/falcon.py b/llmc/models/falcon.py index 90be4f2c2..8c9cef614 100644 --- 
a/llmc/models/falcon.py +++ b/llmc/models/falcon.py @@ -5,15 +5,14 @@ @MODEL_REGISTRY class Falcon(BaseModel): - def __init__(self, model_path, torch_dtype, device_map=None, use_cache=False): - super().__init__(model_path, torch_dtype, device_map, use_cache) + def __init__(self, model_path, torch_dtype): + super().__init__(model_path, torch_dtype) def find_blocks(self): self.blocks = self.model.transformer.h def find_embed_layers(self): self.word_embeddings = self.model.transformer.word_embeddings - self.rotary_emb = self.model.model.rotary_emb def find_block_name(self): self.block_name_prefix = 'model.transformer.h' @@ -21,11 +20,8 @@ def find_block_name(self): def get_embed_layers(self): return [self.word_embeddings] - def get_attention_rotary_layers(self): - return [self.rotary_emb] - def get_layers_except_blocks(self): - return [self.word_embeddings, self.rotary_emb, self.model.transformer.ln_f] + return [self.word_embeddings, self.model.transformer.ln_f] def has_bias(self): return False diff --git a/llmc/models/gemma2.py b/llmc/models/gemma2.py index 402a66153..b4696f921 100644 --- a/llmc/models/gemma2.py +++ b/llmc/models/gemma2.py @@ -1,34 +1,12 @@ -from loguru import logger - from llmc.utils.registry_factory import MODEL_REGISTRY -try: - from transformers.models.gemma2.modeling_gemma2 import Gemma2RMSNorm -except Exception: - logger.warning('Gemma2 not found') -from types import MethodType - -import torch.nn as nn - from .base_model import BaseModel -def gemma2_rms_norm_forward(self, x): - output = self._norm(x.float()) - output = output * self.weight.float() - return output.type_as(x) - - @MODEL_REGISTRY class Gemma2(BaseModel): - def __init__(self, model_path, torch_dtype, device_map=None, use_cache=False): - super().__init__(model_path, torch_dtype, device_map, use_cache) - for m in self.model.modules(): - if isinstance(m, Gemma2RMSNorm): - w = m.weight.data - del m.weight - m.weight = nn.Parameter(w + 1.0) - m.forward = MethodType(gemma2_rms_norm_forward, m) + def __init__(self, model_path, torch_dtype): + super().__init__(model_path, torch_dtype) def find_blocks(self): self.blocks = self.model.model.layers @@ -43,18 +21,9 @@ def find_block_name(self): def get_embed_layers(self): return [self.embed_tokens] - def get_head_layers(self): - return [self.model.lm_head] - - def get_pre_head_layernorm_layers(self): - return [self.model.model.norm] - def get_layers_except_blocks(self): return [self.embed_tokens, self.model.model.norm, self.model.lm_head] - def skip_layer_name(self): - return ['lm_head'] - def has_bias(self): return False @@ -93,7 +62,6 @@ def get_subsets_in_block(self, block): 'input': ['mlp.gate_proj'], 'inspect': block.mlp, 'has_kwargs': False, - 'is_mlp': True, }, { 'layers': {'mlp.down_proj': block.mlp.down_proj}, @@ -101,6 +69,5 @@ def get_subsets_in_block(self, block): 'input': ['mlp.down_proj'], 'inspect': block.mlp.down_proj, 'has_kwargs': False, - 'is_mlp': True, }, ] diff --git a/llmc/models/internlm2.py b/llmc/models/internlm2.py index 39f1b57eb..5e17c0d90 100644 --- a/llmc/models/internlm2.py +++ b/llmc/models/internlm2.py @@ -1,4 +1,3 @@ -from llmc.compression.quantization.module_utils import _TRANSFORMERS_LN_TYPES_ from llmc.utils.registry_factory import MODEL_REGISTRY from .base_model import BaseModel @@ -6,10 +5,8 @@ @MODEL_REGISTRY class InternLM2(BaseModel): - def __init__(self, model_path, torch_dtype, device_map=None, use_cache=False): - super().__init__(model_path, torch_dtype, device_map, use_cache) - global _TRANSFORMERS_LN_TYPES_ - 
_TRANSFORMERS_LN_TYPES_ += [type(self.model.model.norm)] + def __init__(self, model_path, torch_dtype): + super().__init__(model_path, torch_dtype) def find_blocks(self): self.blocks = self.model.model.layers @@ -23,18 +20,9 @@ def find_block_name(self): def get_embed_layers(self): return [self.tok_embeddings] - def get_head_layers(self): - return [self.model.output] - - def get_pre_head_layernorm_layers(self): - return [self.model.model.norm] - def get_layers_except_blocks(self): return [self.tok_embeddings, self.model.model.norm, self.model.output] - def skip_layer_name(self): - return ['lm_head'] - def has_bias(self): return False @@ -69,7 +57,6 @@ def get_subsets_in_block(self, block): 'input': ['feed_forward.w1'], 'inspect': block.feed_forward, 'has_kwargs': False, - 'is_mlp': True, }, { 'layers': {'feed_forward.w2': block.feed_forward.w2}, @@ -77,6 +64,5 @@ def get_subsets_in_block(self, block): 'input': ['feed_forward.w2'], 'inspect': block.feed_forward.w2, 'has_kwargs': False, - 'is_mlp': True, }, ] diff --git a/llmc/models/llama.py b/llmc/models/llama.py index 35e62bc86..10da9a38e 100644 --- a/llmc/models/llama.py +++ b/llmc/models/llama.py @@ -5,25 +5,21 @@ @MODEL_REGISTRY class Llama(BaseModel): - def __init__(self, model_path, torch_dtype, device_map=None, use_cache=False): - super().__init__(model_path, torch_dtype, device_map, use_cache) + def __init__(self, model_path, torch_dtype): + super().__init__(model_path, torch_dtype) def find_blocks(self): self.blocks = self.model.model.layers def find_embed_layers(self): self.embed_tokens = self.model.model.embed_tokens - self.rotary_emb = self.model.model.rotary_emb def find_block_name(self): self.block_name_prefix = 'model.layers' self.pairs = {'q_proj': 'qkv', 'o_proj': 'out', 'up_proj': 'fc1'} def get_embed_layers(self): - return [self.embed_tokens] - - def get_attention_rotary_layers(self): - return [self.rotary_emb] + return [self.model.model.embed_tokens] def get_head_layers(self): return [self.model.lm_head] @@ -32,10 +28,7 @@ def get_pre_head_layernorm_layers(self): return [self.model.model.norm] def get_layers_except_blocks(self): - return [self.embed_tokens, self.rotary_emb, self.model.model.norm, self.model.lm_head] # noqa - - def skip_layer_name(self): - return ['lm_head'] + return [self.embed_tokens, self.model.model.norm, self.model.lm_head] def has_bias(self): return False diff --git a/llmc/models/llava.py b/llmc/models/llava.py index e6c31ea6f..bc230241b 100644 --- a/llmc/models/llava.py +++ b/llmc/models/llava.py @@ -6,7 +6,7 @@ from .llama import Llama try: - from transformers import AutoProcessor, LlavaForConditionalGeneration + from transformers import LlavaForConditionalGeneration except Exception: logger.info( 'LlavaForConditionalGeneration is not supported in this version of transfomers.' 
@@ -16,24 +16,18 @@ @MODEL_REGISTRY class Llava(Llama): - def __init__(self, model_path, torch_dtype, device_map=None, use_cache=False): - super().__init__(model_path, torch_dtype, device_map, use_cache) + def __init__(self, model_path, torch_dtype): + super().__init__(model_path, torch_dtype) def build_model(self): - self.vlm_model_config = AutoConfig.from_pretrained( + self.model_config = AutoConfig.from_pretrained( self.model_path, trust_remote_code=True ) - if not self.use_cache: - self.vlm_model_config.text_config.use_cache = False - logger.info(f'self.vlm_model_config : {self.vlm_model_config}') - self.vlm_model = LlavaForConditionalGeneration.from_pretrained( + self.model_config.text_config.use_cache = False + self.llava_model = LlavaForConditionalGeneration.from_pretrained( self.model_path, - config=self.vlm_model_config, + config=self.model_config, torch_dtype=self.torch_dtype, low_cpu_mem_usage=True, ) - self.vision_tower = self.vlm_model.vision_tower - self.multi_modal_projector = self.vlm_model.multi_modal_projector - self.processor = AutoProcessor.from_pretrained(self.model_path) - self.model = self.vlm_model.language_model - self.model_config = self.vlm_model_config.text_config + self.model = self.llava_model.language_model diff --git a/llmc/models/mistral.py b/llmc/models/mistral.py index 7689b4092..be18a0b4e 100644 --- a/llmc/models/mistral.py +++ b/llmc/models/mistral.py @@ -5,8 +5,8 @@ @MODEL_REGISTRY class Mistral(BaseModel): - def __init__(self, model_path, torch_dtype, device_map=None, use_cache=False): - super().__init__(model_path, torch_dtype, device_map, use_cache) + def __init__(self, model_path, torch_dtype): + super().__init__(model_path, torch_dtype) def find_blocks(self): self.blocks = self.model.model.layers @@ -24,9 +24,6 @@ def get_embed_layers(self): def get_layers_except_blocks(self): return [self.embed_tokens, self.model.model.norm, self.model.lm_head] - def skip_layer_name(self): - return ['lm_head'] - def has_bias(self): return False diff --git a/llmc/models/mixtral.py b/llmc/models/mixtral.py index fec0fcdb5..a94583101 100644 --- a/llmc/models/mixtral.py +++ b/llmc/models/mixtral.py @@ -5,8 +5,8 @@ @MODEL_REGISTRY class Mixtral(BaseModel): - def __init__(self, model_path, torch_dtype, device_map=None, use_cache=False): - super().__init__(model_path, torch_dtype, device_map, use_cache) + def __init__(self, model_path, torch_dtype): + super().__init__(model_path, torch_dtype) def find_blocks(self): self.blocks = self.model.model.layers @@ -23,9 +23,6 @@ def get_embed_layers(self): def get_layers_except_blocks(self): return [self.embed_tokens, self.model.model.norm, self.model.lm_head] - def skip_layer_name(self): - return ['lm_head'] - def has_bias(self): return False @@ -35,11 +32,6 @@ def get_layernorms_in_block(self, block): 'post_attention_layernorm': block.post_attention_layernorm, } - def get_extra_modules(self, block): - return { - 'block_sparse_moe': block.block_sparse_moe - } - def get_subsets_in_block(self, block): return [ { @@ -61,25 +53,11 @@ def get_subsets_in_block(self, block): 'has_kwargs': False, }, { - 'layers': { - **{f'block_sparse_moe.experts.{i}.w1': block.block_sparse_moe.experts[i].w1 for i in range(len(block.block_sparse_moe.experts))}, # noqa - **{f'block_sparse_moe.experts.{i}.w3': block.block_sparse_moe.experts[i].w3 for i in range(len(block.block_sparse_moe.experts))}, # noqa - }, + 'layers': {'block_sparse_moe.gate': block.block_sparse_moe.gate}, 'prev_op': [block.post_attention_layernorm], - 'input': ['block_sparse_moe'], 
- 'inspect': block.block_sparse_moe, + 'input': ['block_sparse_moe.gate'], + 'inspect': block.block_sparse_moe.gate, 'has_kwargs': False, - 'is_mlp': True, }, - *[ - { - 'layers': {f'block_sparse_moe.experts.{i}.w2': block.block_sparse_moe.experts[i].w2}, # noqa - 'prev_op': [block.block_sparse_moe.experts[i].w3], - 'input': [f'block_sparse_moe.experts.{i}.w2'], - 'inspect': block.block_sparse_moe.experts[i].w2, - 'has_kwargs': False, - 'is_mlp': True, - } - for i in range(len(block.block_sparse_moe.experts)) - ], + # Moe layers can not transform. ] diff --git a/llmc/models/opt.py b/llmc/models/opt.py index 71e2f2114..95ac30aed 100644 --- a/llmc/models/opt.py +++ b/llmc/models/opt.py @@ -5,8 +5,8 @@ @MODEL_REGISTRY class Opt(BaseModel): - def __init__(self, model_path, torch_dtype, device_map=None, use_cache=False): - super().__init__(model_path, torch_dtype, device_map, use_cache) + def __init__(self, model_path, torch_dtype): + super().__init__(model_path, torch_dtype) def find_blocks(self): self.blocks = self.model.model.decoder.layers @@ -38,9 +38,6 @@ def get_layers_except_blocks(self): layers.append(self.model.model.decoder.final_layer_norm) return layers - def skip_layer_name(self): - return ['lm_head'] - def has_bias(self): return True diff --git a/llmc/models/qwen2.py b/llmc/models/qwen2.py index 25840decd..d260fcfa2 100644 --- a/llmc/models/qwen2.py +++ b/llmc/models/qwen2.py @@ -5,15 +5,14 @@ @MODEL_REGISTRY class Qwen2(BaseModel): - def __init__(self, model_path, torch_dtype, device_map=None, use_cache=False): - super().__init__(model_path, torch_dtype, device_map, use_cache) + def __init__(self, model_path, torch_dtype): + super().__init__(model_path, torch_dtype) def find_blocks(self): self.blocks = self.model.model.layers def find_embed_layers(self): self.embed_tokens = self.model.model.embed_tokens - self.rotary_emb = self.model.model.rotary_emb def find_block_name(self): self.block_name_prefix = 'model.layers' @@ -22,20 +21,8 @@ def find_block_name(self): def get_embed_layers(self): return [self.embed_tokens] - def get_attention_rotary_layers(self): - return [self.rotary_emb] - - def get_head_layers(self): - return [self.model.lm_head] - - def get_pre_head_layernorm_layers(self): - return [self.model.model.norm] - def get_layers_except_blocks(self): - return [self.embed_tokens, self.rotary_emb, self.model.model.norm, self.model.lm_head] # noqa - - def skip_layer_name(self): - return ['lm_head'] + return [self.embed_tokens, self.model.model.norm, self.model.lm_head] def has_bias(self): return False @@ -75,7 +62,6 @@ def get_subsets_in_block(self, block): 'input': ['mlp.gate_proj'], 'inspect': block.mlp, 'has_kwargs': False, - 'is_mlp': True, }, { 'layers': {'mlp.down_proj': block.mlp.down_proj}, @@ -83,6 +69,5 @@ def get_subsets_in_block(self, block): 'input': ['mlp.down_proj'], 'inspect': block.mlp.down_proj, 'has_kwargs': False, - 'is_mlp': True, }, ] diff --git a/llmc/models/starcoder.py b/llmc/models/starcoder.py index 0a97f9b63..be4d8bc30 100644 --- a/llmc/models/starcoder.py +++ b/llmc/models/starcoder.py @@ -5,8 +5,8 @@ @MODEL_REGISTRY class Starcoder(BaseModel): - def __init__(self, model_path, torch_dtype, device_map=None, use_cache=False): - super().__init__(model_path, torch_dtype, device_map, use_cache) + def __init__(self, model_path, torch_dtype): + super().__init__(model_path, torch_dtype) def find_blocks(self): self.blocks = self.model.transformer.h @@ -29,9 +29,6 @@ def get_layers_except_blocks(self): self.model.lm_head, ] - def skip_layer_name(self): - 
return ['lm_head'] - def has_bias(self): return True diff --git a/llmc/utils/__init__.py b/llmc/utils/__init__.py index 574b27176..bad8b4d1d 100644 --- a/llmc/utils/__init__.py +++ b/llmc/utils/__init__.py @@ -1,4 +1 @@ -from .export_autoawq import update_autoawq_quant_config -from .export_vllm import update_vllm_quant_config -from .utils import (check_config, copy_files, mkdirs, - print_important_package_version, seed_all) +from .utils import check_config, copy_files, mkdirs, seed_all diff --git a/llmc/utils/utils.py b/llmc/utils/utils.py index 7bc1f0d5a..4c198b1ce 100644 --- a/llmc/utils/utils.py +++ b/llmc/utils/utils.py @@ -57,11 +57,6 @@ def check_weight_setting(weight_setting): config.model.tokenizer_mode = 'slow' logger.info('Tokenizer_mode is set to slow.') - if 'calib' in config and not config.calib.get('type', False): - config.calib.type = 'txt' - if 'eval' in config and not config.eval.get('type', False): - config.eval.type = 'ppl' - def mkdirs(path): if not os.path.exists(path): @@ -77,12 +72,3 @@ def copy_files(source_dir, target_dir, substring): target_file = os.path.join(target_dir, filename) shutil.copy(source_file, target_file) logger.info(f'Copied {filename} to {target_dir}') - - -def print_important_package_version(): - from importlib.metadata import version - logger.info(f"torch : {version('torch')}") - logger.info(f"transformers : {version('transformers')}") - logger.info(f"tokenizers : {version('tokenizers')}") - logger.info(f"huggingface-hub : {version('huggingface-hub')}") - logger.info(f"datasets : {version('datasets')}") diff --git a/lm-evaluation-harness b/lm-evaluation-harness deleted file mode 160000 index 86fd4ad29..000000000 --- a/lm-evaluation-harness +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 86fd4ad29b1eb168cd1c86dd37d8eb6a93ee67d2 diff --git a/requirements/runtime.txt b/requirements/runtime.txt index 89203076e..7c6153e81 100644 --- a/requirements/runtime.txt +++ b/requirements/runtime.txt @@ -1,27 +1,10 @@ -torch>=2.1.0 -torchvision -timm -pillow +torch loguru -transformers==4.45.2 +transformers==4.44.2 +accelerate==0.31.0 +datasets==2.20.0 huggingface-hub sentencepiece protobuf -accelerate>=0.26.0 zstandard easydict -evaluate>=0.4.0 -datasets>=2.16.0 -jsonlines -numexpr -peft>=0.2.0 -pybind11>=2.6.2 -pytablewriter -rouge-score>=0.0.4 -sacrebleu>=1.5.0 -scikit-learn>=0.24.1 -sqlitedict -tqdm-multiprocess -dill -word2number -more_itertools diff --git a/scripts/run_adadim_llama.sh b/scripts/run_adadim_llama.sh new file mode 100644 index 000000000..28e2a4ba0 --- /dev/null +++ b/scripts/run_adadim_llama.sh @@ -0,0 +1,15 @@ +#!/bin/bash + +gpu_id=0 +export CUDA_VISIBLE_DEVICES=$gpu_id + +llmc=llmc_path +export PYTHONPATH=$llmc:$PYTHONPATH + +task_name=llm_quant_exp + +nohup \ +python -m llmc --config ../configs/quantization/AdaDim/adadim_w8a8_fakequant_eval.yml \ +> ${task_name}.log 2>&1 & + +echo $! > ${task_name}.pid \ No newline at end of file diff --git a/scripts/run_awq_llama.sh b/scripts/run_awq_llama.sh new file mode 100644 index 000000000..3d638583d --- /dev/null +++ b/scripts/run_awq_llama.sh @@ -0,0 +1,16 @@ +#!/bin/bash + +gpu_id=0 +export CUDA_VISIBLE_DEVICES=$gpu_id + +llmc=llmc_path +export PYTHONPATH=$llmc:$PYTHONPATH + +task_name=llm_quant_exp + +nohup \ +python -m llmc --config ../configs/quantization/Awq/awq_w4a16_fakequant_eval.yml \ +> ${task_name}.log 2>&1 & + +echo $! 
> ${task_name}.pid + diff --git a/scripts/run_dgq_llama.sh b/scripts/run_dgq_llama.sh new file mode 100644 index 000000000..aa3c109be --- /dev/null +++ b/scripts/run_dgq_llama.sh @@ -0,0 +1,16 @@ +#!/bin/bash + +gpu_id=0 +export CUDA_VISIBLE_DEVICES=$gpu_id + +llmc=llmc_path +export PYTHONPATH=$llmc:$PYTHONPATH + +task_name=llm_quant_exp + +nohup \ +python -m llmc --config ../configs/quantization/DGQ/dgq_w4a8_fakequant_eval.yml \ +> ${task_name}.log 2>&1 & + +echo $! > ${task_name}.pid + diff --git a/scripts/run_gptq_llama.sh b/scripts/run_gptq_llama.sh new file mode 100644 index 000000000..05e2609d5 --- /dev/null +++ b/scripts/run_gptq_llama.sh @@ -0,0 +1,15 @@ +#!/bin/bash + +gpu_id=0 +export CUDA_VISIBLE_DEVICES=$gpu_id + +llmc=llmc_path +export PYTHONPATH=$llmc:$PYTHONPATH + +task_name=llm_quant_exp + +nohup \ +python -m llmc --config ../configs/quantization/GPTQ/gptq_quarot.yml \ +> ${task_name}.log 2>&1 & + +echo $! > ${task_name}.pid \ No newline at end of file diff --git a/scripts/run_gptq_owq_llama.sh b/scripts/run_gptq_owq_llama.sh new file mode 100644 index 000000000..7e0f6d22c --- /dev/null +++ b/scripts/run_gptq_owq_llama.sh @@ -0,0 +1,15 @@ +#!/bin/bash + +gpu_id=0 +export CUDA_VISIBLE_DEVICES=$gpu_id + +llmc=llmc_path +export PYTHONPATH=$llmc:$PYTHONPATH + +task_name=llm_quant_exp + +nohup \ +python -m llmc --config ../configs/quantization/GPTQ/gptq_owq_w4a16_fakequant_eval.yml \ +> ${task_name}.log 2>&1 & + +echo $! > ${task_name}.pid \ No newline at end of file diff --git a/scripts/run_hqq_llama.sh b/scripts/run_hqq_llama.sh new file mode 100644 index 000000000..7f995c9a0 --- /dev/null +++ b/scripts/run_hqq_llama.sh @@ -0,0 +1,15 @@ +#!/bin/bash + +gpu_id=0 +export CUDA_VISIBLE_DEVICES=$gpu_id + +llmc=llmc_path +export PYTHONPATH=$llmc:$PYTHONPATH + +task_name=llm_quant_exp + +nohup \ +python -m llmc --config ../configs/quantization/HQQ/hqq_w4a16_fakequant_eval.yml \ +> ${task_name}.log 2>&1 & + +echo $! > ${task_name}.pid \ No newline at end of file diff --git a/scripts/run_in_tmux_sequence.sh b/scripts/run_in_tmux_sequence.sh new file mode 100644 index 000000000..6534e1ae5 --- /dev/null +++ b/scripts/run_in_tmux_sequence.sh @@ -0,0 +1,25 @@ +#!/bin/bash + +gpu_id=0 +export CUDA_VISIBLE_DEVICES=$gpu_id + +llmc=llmc_path +export PYTHONPATH=$llmc:$PYTHONPATH + + +task_name=rtn_w8a8_fakequant_eval +echo "${task_name} running..." +python -m llmc --config ../configs/quantization/RTN/rtn_w8a8_fakequant_eval.yml \ +> ${task_name}.log 2>&1 + + +task_name=smoothquant_llama_w8a8_fakequant_eval_general +echo "${task_name} running..." +python -m llmc --config ../configs/quantization/SmoothQuant/smoothquant_llama_w8a8_fakequant_eval_general.yml \ +> ${task_name}.log 2>&1 + + +task_name=osplus_llama_w8a8_fakequant_eval_general +echo "${task_name} running..." +python -m llmc --config ../configs/quantization/OsPlus/osplus_llama_w8a8_fakequant_eval_general.yml \ +> ${task_name}.log 2>&1 diff --git a/scripts/run_llmint8_llama.sh b/scripts/run_llmint8_llama.sh new file mode 100644 index 000000000..a4261cb6d --- /dev/null +++ b/scripts/run_llmint8_llama.sh @@ -0,0 +1,16 @@ +#!/bin/bash + +gpu_id=0 +export CUDA_VISIBLE_DEVICES=$gpu_id + +llmc=llmc_path +export PYTHONPATH=$llmc:$PYTHONPATH + +task_name=llm_quant_exp + +nohup \ +python -m llmc --config ../configs/quantization/LlmInt8/llmint8_w8a8_fakequant_eval.yml \ +> ${task_name}.log 2>&1 & + +echo $! 
> ${task_name}.pid + diff --git a/scripts/run_ntweak_llama.sh b/scripts/run_ntweak_llama.sh new file mode 100644 index 000000000..b94e260ae --- /dev/null +++ b/scripts/run_ntweak_llama.sh @@ -0,0 +1,16 @@ +#!/bin/bash + +gpu_id=0 +export CUDA_VISIBLE_DEVICES=$gpu_id + +llmc=llmc_path +export PYTHONPATH=$llmc:$PYTHONPATH + +task_name=llm_quant_exp + +nohup \ +python -m llmc --config ../configs/quantization/NormTweaking/ntweak_llama_w4a16_fakequant_eval.yml \ +> ${task_name}.log 2>&1 & + +echo $! > ${task_name}.pid + diff --git a/scripts/run_omniq_llama.sh b/scripts/run_omniq_llama.sh new file mode 100644 index 000000000..5f7241a61 --- /dev/null +++ b/scripts/run_omniq_llama.sh @@ -0,0 +1,16 @@ +#!/bin/bash + +gpu_id=0 +export CUDA_VISIBLE_DEVICES=$gpu_id + +llmc=llmc_path +export PYTHONPATH=$llmc:$PYTHONPATH + +task_name=llm_quant_exp + +nohup \ +python -m llmc --config ../configs/quantization/OmniQuant/omniq_llama_w8a8_fakequant_eval.yml \ +> ${task_name}.log 2>&1 & + +echo $! > ${task_name}.pid + diff --git a/scripts/run_omniq_mistral.sh b/scripts/run_omniq_mistral.sh new file mode 100644 index 000000000..0164521af --- /dev/null +++ b/scripts/run_omniq_mistral.sh @@ -0,0 +1,15 @@ +#!/bin/bash + +gpu_id=0 +export CUDA_VISIBLE_DEVICES=$gpu_id + +llmc=llmc_path +export PYTHONPATH=$llmc:$PYTHONPATH + +task_name=llm_quant_exp + +nohup \ +python -m llmc --config ../configs/quantization/OmniQuant/omniq_mistral_w8a8_fakequant_eval.yml \ +> ${task_name}.log 2>&1 & + +echo $! > ${task_name}.pid diff --git a/scripts/run_omniq_opt.sh b/scripts/run_omniq_opt.sh new file mode 100644 index 000000000..2e0da4b4e --- /dev/null +++ b/scripts/run_omniq_opt.sh @@ -0,0 +1,15 @@ +#!/bin/bash + +gpu_id=0 +export CUDA_VISIBLE_DEVICES=$gpu_id + +llmc=llmc_path +export PYTHONPATH=$llmc:$PYTHONPATH + +task_name=llm_quant_exp + +nohup \ +python -m llmc --config ../configs/quantization/OmniQuant/omniq_opt_w8a8_fakequant_eval.yml \ +> ${task_name}.log 2>&1 & + +echo $! > ${task_name}.pid \ No newline at end of file diff --git a/scripts/run_osplus_llama.sh b/scripts/run_osplus_llama.sh new file mode 100644 index 000000000..983364620 --- /dev/null +++ b/scripts/run_osplus_llama.sh @@ -0,0 +1,15 @@ +#!/bin/bash + +gpu_id=0 +export CUDA_VISIBLE_DEVICES=$gpu_id + +llmc=llmc_path +export PYTHONPATH=$llmc:$PYTHONPATH + +task_name=llm_quant_exp + +nohup \ +python -m llmc --config ../configs/quantization/OsPlus/osplus_llama_w8a8_fakequant_eval_general.yml \ +> ${task_name}.log 2>&1 & + +echo $! > ${task_name}.pid diff --git a/scripts/run_osplus_opt.sh b/scripts/run_osplus_opt.sh new file mode 100644 index 000000000..37f666150 --- /dev/null +++ b/scripts/run_osplus_opt.sh @@ -0,0 +1,15 @@ +#!/bin/bash + +gpu_id=0 +export CUDA_VISIBLE_DEVICES=$gpu_id + +llmc=llmc_path +export PYTHONPATH=$llmc:$PYTHONPATH + +task_name=llm_quant_exp + +nohup \ +python -m llmc --config ../configs/quantization/OsPlus/osplus_opt_w8a8_fakequant_eval_general.yml \ +> ${task_name}.log 2>&1 & + +echo $! > ${task_name}.pid diff --git a/scripts/run_quarot_llama.sh b/scripts/run_quarot_llama.sh new file mode 100644 index 000000000..760a7c5c1 --- /dev/null +++ b/scripts/run_quarot_llama.sh @@ -0,0 +1,15 @@ +#!/bin/bash + +gpu_id=1 +export CUDA_VISIBLE_DEVICES=$gpu_id + +llmc=llmc_path +export PYTHONPATH=$llmc:$PYTHONPATH + +task_name=llm_quant_exp + +nohup \ +python -m llmc --config ../configs/quantization/QuaRot/quarot_w4a4.yml \ +> ${task_name}.log 2>&1 & + +echo $! 
> ${task_name}.pid \ No newline at end of file diff --git a/scripts/run_quik_llama.sh b/scripts/run_quik_llama.sh new file mode 100644 index 000000000..818069d8b --- /dev/null +++ b/scripts/run_quik_llama.sh @@ -0,0 +1,15 @@ +#!/bin/bash + +gpu_id=0 +export CUDA_VISIBLE_DEVICES=$gpu_id + +llmc=llmc_path +export PYTHONPATH=$llmc:$PYTHONPATH + +task_name=llm_quant_exp + +nohup \ +python -m llmc --config ../configs/quantization/QUIK/quik_w4a4_fakequant_eval.yml \ +> ${task_name}.log 2>&1 & + +echo $! > ${task_name}.pid \ No newline at end of file diff --git a/scripts/run_rtn_llama.sh b/scripts/run_rtn_llama.sh new file mode 100644 index 000000000..8d328a7f4 --- /dev/null +++ b/scripts/run_rtn_llama.sh @@ -0,0 +1,15 @@ +#!/bin/bash + +gpu_id=0 +export CUDA_VISIBLE_DEVICES=$gpu_id + +llmc=llmc_path +export PYTHONPATH=$llmc:$PYTHONPATH + +task_name=llm_quant_exp + +nohup \ +python -m llmc --config ../configs/quantization/RTN/rtn_w8a8_fakequant_eval.yml \ +> ${task_name}.log 2>&1 & + +echo $! > ${task_name}.pid \ No newline at end of file diff --git a/scripts/run_rtn_llama_static.sh b/scripts/run_rtn_llama_static.sh new file mode 100644 index 000000000..cc7e62da4 --- /dev/null +++ b/scripts/run_rtn_llama_static.sh @@ -0,0 +1,15 @@ +#!/bin/bash + +gpu_id=0 +export CUDA_VISIBLE_DEVICES=$gpu_id + +llmc=llmc_path +export PYTHONPATH=$llmc:$PYTHONPATH + +task_name=llm_quant_exp + +nohup \ +python -m llmc --config ../configs/quantization/RTN/rtn_w8a8_pertensor_static.yml \ +> ${task_name}.log 2>&1 & + +echo $! > ${task_name}.pid \ No newline at end of file diff --git a/scripts/run_shortgpt_llama.sh b/scripts/run_shortgpt_llama.sh new file mode 100644 index 000000000..f56c090ae --- /dev/null +++ b/scripts/run_shortgpt_llama.sh @@ -0,0 +1,15 @@ +#!/bin/bash + +gpu_id=0 +export CUDA_VISIBLE_DEVICES=$gpu_id + +llmc=llmc_path +export PYTHONPATH=$llmc:$PYTHONPATH + +task_name=llm_quant_exp + +nohup \ +python -m llmc --config ../configs/sparsification/ShortGPT/shortgpt.yml \ +> ${task_name}.log 2>&1 & + +echo $! > ${task_name}.pid \ No newline at end of file diff --git a/scripts/run_smoothquant_llama.sh b/scripts/run_smoothquant_llama.sh new file mode 100644 index 000000000..6715d68e0 --- /dev/null +++ b/scripts/run_smoothquant_llama.sh @@ -0,0 +1,15 @@ +#!/bin/bash + +gpu_id=0 +export CUDA_VISIBLE_DEVICES=$gpu_id + +llmc=llmc_path +export PYTHONPATH=$llmc:$PYTHONPATH + +task_name=llm_quant_exp + +nohup \ +python -m llmc --config ../configs/quantization/SmoothQuant/smoothquant_llama_w8a8_fakequant_eval_general.yml \ +> ${task_name}.log 2>&1 & + +echo $! > ${task_name}.pid diff --git a/scripts/run_smoothquant_opt.sh b/scripts/run_smoothquant_opt.sh new file mode 100644 index 000000000..38f7b616d --- /dev/null +++ b/scripts/run_smoothquant_opt.sh @@ -0,0 +1,15 @@ +#!/bin/bash + +gpu_id=0 +export CUDA_VISIBLE_DEVICES=$gpu_id + +llmc=llmc_path +export PYTHONPATH=$llmc:$PYTHONPATH + +task_name=llm_quant_exp + +nohup \ +python -m llmc --config ../configs/quantization/SmoothQuant/smoothquant_opt_w8a8_fakequant_eval.yml \ +> ${task_name}.log 2>&1 & + +echo $! 
> ${task_name}.pid
diff --git a/scripts/run_spinquant_llama.sh b/scripts/run_spinquant_llama.sh
new file mode 100644
index 000000000..240858f88
--- /dev/null
+++ b/scripts/run_spinquant_llama.sh
@@ -0,0 +1,15 @@
+#!/bin/bash
+
+gpu_id=0
+export CUDA_VISIBLE_DEVICES=$gpu_id
+
+llmc=llmc_path
+export PYTHONPATH=$llmc:$PYTHONPATH
+
+task_name=llm_quant_exp
+
+nohup \
+python -m llmc --config ../configs/quantization/SpinQuant/spinquant_w4a4.yml \
+> ${task_name}.log 2>&1 &
+
+echo $! > ${task_name}.pid
\ No newline at end of file
diff --git a/scripts/run_spqr_llama.sh b/scripts/run_spqr_llama.sh
new file mode 100644
index 000000000..270c61611
--- /dev/null
+++ b/scripts/run_spqr_llama.sh
@@ -0,0 +1,15 @@
+#!/bin/bash
+
+gpu_id=0
+export CUDA_VISIBLE_DEVICES=$gpu_id
+
+llmc=llmc_path
+export PYTHONPATH=$llmc:$PYTHONPATH
+
+task_name=llm_quant_exp
+
+nohup \
+python -m llmc --config ../configs/quantization/SpQR/spqr_w4a16_fakequant_eval.yml \
+> ${task_name}.log 2>&1 &
+
+echo $! > ${task_name}.pid
\ No newline at end of file
diff --git a/scripts/run_wanda_llama.sh b/scripts/run_wanda_llama.sh
new file mode 100644
index 000000000..96b31c518
--- /dev/null
+++ b/scripts/run_wanda_llama.sh
@@ -0,0 +1,15 @@
+#!/bin/bash
+
+gpu_id=0
+export CUDA_VISIBLE_DEVICES=$gpu_id
+
+llmc=llmc_path
+export PYTHONPATH=$llmc:$PYTHONPATH
+
+task_name=llm_quant_exp
+
+nohup \
+python -m llmc --config ../configs/sparsification/Wanda/wanda.yml \
+> ${task_name}.log 2>&1 &
+
+echo $! > ${task_name}.pid
\ No newline at end of file
diff --git a/tools/outlier_analysis.py b/tools/outlier_analysis.py
new file mode 100644
index 000000000..474eaf7ce
--- /dev/null
+++ b/tools/outlier_analysis.py
@@ -0,0 +1,483 @@
+import argparse
+import functools
+import gc
+import os
+import sys
+
+import torch
+from loguru import logger
+from tqdm import tqdm
+from transformers import AutoConfig, AutoModelForCausalLM
+
+sys.path.append('..')
+import matplotlib.pyplot as plt
+import torch.nn as nn
+
+from llmc.compression.quantization import FakeQuantLinear, Quantizer
+from llmc.compression.quantization.module_utils import (
+    _LLMC_LINEAR_TYPES_, _TRANSFORMERS_LINEAR_TYPES_, RotateLinear)
+from llmc.data import BaseDataset, BaseTokenizer
+from llmc.models import *
+from llmc.utils import check_config, mkdirs, seed_all
+from llmc.utils.registry_factory import ALGO_REGISTRY, MODEL_REGISTRY
+
+
+def calculate_kurtosis_channel(signal):
+    """Calculates the kurtosis of a given signal.
+
+    Args:
+        signal (torch.Tensor): Input signal, shape (4096, 1024).
+
+    Returns:
+        float: The average kurtosis value of the rows.
+    """
+    signal = signal.float()
+    mean = torch.mean(signal, dim=1, keepdim=True)
+    std = torch.std(signal, dim=1, keepdim=True)
+
+    std[std == 0] = 1e-8  # Avoid division by zero
+
+    standardized_signal = (signal - mean) / std
+    kurtosis = torch.mean(
+        standardized_signal**4, dim=1
+    )  # Calculate kurtosis for each row
+
+    average_kurtosis = torch.mean(kurtosis)
+
+    return average_kurtosis.item()
+
+
+def calculate_kurtosis(signal):
+    """Calculates the kurtosis of a given signal.
+
+    Args:
+        signal (torch.Tensor): Input signal, shape (N, *).
+
+    Returns:
+        float: The kurtosis value.
+ """ + signal = signal.float() + signal = signal.view(1, -1) + mean = torch.mean(signal) + std = torch.std(signal) + + if std == 0: + return float('inf') + + standardized_signal = (signal - mean) / (std + 1e-8) + + kurtosis = torch.mean(standardized_signal**4) # - 3 + + return kurtosis.item() + + +def draw(save_path, save_name, X, Y1, Y2): + fig = plt.figure() + ax = fig.add_subplot(1, 1, 1) + ax.plot(X, Y1) + ax.plot(X, Y2) + plt.xlabel('channel') + plt.ylabel('value') + plt.title(save_name) + fig.savefig(f'{save_path}/{save_name}.jpg') + plt.close(fig) + plt.cla() + + +def analysis_block_cosine(res, t_res, args): + cosine_sim = nn.CosineSimilarity() + + for name in res: + oups = res[name] + t_oups = t_res[name] + + layer_cosine_dict = {} + for j in range(oups.shape[0]): + cos = cosine_sim(oups[j].float().view(1, -1), t_oups[j].float().view(1, -1)) + + if name not in layer_cosine_dict: + layer_cosine_dict[name] = [] + + layer_cosine_dict[name].append(cos.item()) + + for name in layer_cosine_dict: + cos_values = layer_cosine_dict[name] + min_cos = min(cos_values) + avg_cos = sum(cos_values) / len(cos_values) + logger.info(name) + logger.info(f'min_cos : {min_cos}') + logger.info(f'avg_cos : {avg_cos}') + + +def avg_k_a(a, k): + result = (a[:, None] * k[None, :]).sum(dim=0) + + total_sum = result.sum() + print(result.shape) + + average = total_sum / result.numel() + return average + + +def analysis_block_outlier(res, t_res, org_w, trans_w, arg): + if args.prof_gra in ['per_channel', 'per_group']: + kurt_func = calculate_kurtosis_channel + else: + kurt_func = calculate_kurtosis + + for name in res: + logger.info(name) + + weight = org_w[name] + t_weight = trans_w[name] + + if args.prof_gra == 'per_group': + weight = wquanter.reshape_tensor(weight) + t_weight = wquanter.reshape_tensor(t_weight) + + k_w = kurt_func(weight) + k_t_w = kurt_func(t_weight) + + logger.info(f'The kurtosis of org weight is :{k_w}') + logger.info(f'The kurtosis of trans weight is :{k_t_w}') + + tensor = res[name].mean(dim=0) + tensor = tensor.float() + + t_tensor = t_res[name].mean(dim=0) + t_tensor = t_tensor.float() + + k_a = kurt_func(tensor) + k_t_a = kurt_func(t_tensor) + + logger.info(f'The kurtosis of org act is :{k_a}') + logger.info(f'The kurtosis of trans act is :{k_t_a}') + + if args.draw: + save_outlier_path = os.path.join(args.save_path, 'outlier') + save_t_outlier_path = os.path.join(args.save_path, 't_outlier') + + t_min_val = t_tensor.amin(dim=0).detach().cpu().numpy() + t_max_val = t_tensor.amax(dim=0).detach().cpu().numpy() + + min_val = tensor.amin(dim=0).detach().cpu().numpy() + max_val = tensor.amax(dim=0).detach().cpu().numpy() + + if not os.path.exists(args.save_path): + mkdirs(save_outlier_path) + mkdirs(save_t_outlier_path) + + draw( + save_path=save_outlier_path, + save_name=name, + X=range(tensor.shape[-1]), + Y1=min_val, + Y2=max_val, + ) + + draw( + save_path=save_t_outlier_path, + save_name=name, + X=range(t_tensor.shape[-1]), + Y1=t_min_val, + Y2=t_max_val, + ) + + +def register_hook(block, idx, args): + hooks = [] + for name, m in block.named_modules(): + if not args.cosine: + if isinstance(m, tuple(_LLMC_LINEAR_TYPES_ + _TRANSFORMERS_LINEAR_TYPES_)): + hooks.append( + m.register_forward_hook( + functools.partial( + stat_input_hook, + w=m.weight.data, + name=name, + idx=idx, + args=args, + ) + ) + ) + else: + if isinstance(m, tuple(_LLMC_LINEAR_TYPES_ + _TRANSFORMERS_LINEAR_TYPES_)): + hooks.append( + m.register_forward_hook( + functools.partial( + stat_output_hook, name=name, idx=idx, 
args=args + ) + ) + ) + + return hooks + + +def stat_input_hook(m, x, y, w, name, idx, args): + if isinstance(x, tuple): + x = x[0] + + layer_name = f'block_{idx}.{name}' + + if args.online_rotate and t: + if 'down_proj' in layer_name: + x = down_rotater.rotate(x) + elif 'o_proj' in layer_name: + x = o_rotater.rotate(x) + + if t: + t_res[layer_name] = x + trans_w[layer_name] = w + else: + res[layer_name] = x + org_w[layer_name] = w + + +def stat_output_hook(m, x, y, name, idx, args): + if isinstance(y, tuple): + y = y[0] + layer_name = f'block_{idx}.{name}' + if t: + t_res[layer_name] = y + else: + res[layer_name] = y + + +def block_forward(block, input_data, input_kwargs): + output = [] + + for i in range(len(input_data)): + input_data[i] = input_data[i].to( + device=next(block.parameters()).device, + dtype=next(block.parameters()).dtype, + ) + if ( + 'attention_mask' in input_kwargs[i] + and input_kwargs[i]['attention_mask'] is not None + ): + input_kwargs[i]['attention_mask'] = input_kwargs[i]['attention_mask'].cuda() + with torch.no_grad(): + out = block(input_data[i], **input_kwargs[i])[0] + output.append(out) + return output + + +class analysis_quanter(Quantizer): + def __init__(self, bit, symmetric, granularity, **kwargs): + super().__init__(bit, symmetric, granularity, **kwargs) + + def fake_quant_weight_dynamic(self, module, args={}): + weight = module.weight + if 'int_indices' in args: + if self.granularity == 'per_group': + assert len(args['int_indices']) % self.group_size == 0 + q_weight = weight[:, args['int_indices']] + fp_weight = weight[:, args['fp_indices']] + + elif 'dim' in args and 'ic' in args['dim']: + q_weight = weight.T + else: + q_weight = weight + + if 'current_bit' in args: + org_bit = self.bit + self.bit = args['current_bit'] + + org_w_shape = q_weight.shape + org_w_dtype = q_weight.dtype + q_weight, scales, zeros, max_int, min_int = self.get_tensor_qparams( + q_weight, args + ) + + q_weight = self.quant_dequant(q_weight, scales, zeros, max_int, min_int) + q_weight = self.restore_tensor(q_weight, org_w_shape).to(org_w_dtype) + + if 'current_bit' in args: + self.bit = org_bit + + if 'int_indices' in args: + mix_weight = torch.zeros_like(weight) + mix_weight[:, args['int_indices']] = q_weight + mix_weight[:, args['fp_indices']] = fp_weight + return mix_weight + + elif 'dim' in args and 'ic' in args['dim']: + q_weight = q_weight.T + + return q_weight + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('--dataset_name', type=str) + parser.add_argument('--data_path', type=str) + parser.add_argument('--n_samples', type=int, default=128) + parser.add_argument('--bs', type=int, default=-1) + parser.add_argument('--seq_len', type=int, default=512) + parser.add_argument('--seed', type=int, default=42) + parser.add_argument('--preproc', type=str, default='general') + parser.add_argument('--save_path', type=str, default='./save') + parser.add_argument('--draw', action='store_true') + parser.add_argument('--cosine', action='store_true') + parser.add_argument('--model_type', type=str, required=True) + parser.add_argument('--model_path', type=str, required=True) + parser.add_argument('--t_model_path', type=str) + parser.add_argument('--torch_dtype', type=str, default='auto') + parser.add_argument('--tokenizer_mode', type=str, default='slow') + + parser.add_argument('--w_only', action='store_true') + parser.add_argument('--wbit', type=int, default=6) + parser.add_argument('--wsym', action='store_true') + parser.add_argument('--wgra', 
type=str, default='per_channel') + parser.add_argument('--group_size', type=int, default=-1) + + parser.add_argument('--abit', type=int, default=6) + parser.add_argument('--asym', action='store_true') + parser.add_argument('--agra', type=str, default='per_token') + + parser.add_argument('--log_dir', type=str, default='log.txt') + parser.add_argument('--prof_gra', type=str, default='per_tensor') + parser.add_argument('--config_path', type=str) + + parser.add_argument('--online_rotate', action='store_true') + + args = parser.parse_args() + + seed_all(args.seed) + + logger.remove() + logger.add(args.log_dir, level='INFO', mode='w') + + logger.info(f'args : {args}') + + calib_cfg = { + 'name': args.dataset_name, + 'download': False, + 'path': args.data_path, + 'n_samples': args.n_samples, + 'bs': args.bs, + 'seq_len': args.seq_len, + 'preproc': args.preproc, + 'seed': args.seed, + } + + model_config = { + 'type': args.model_type, + 'path': args.model_path, + 'torch_dtype': args.torch_dtype, + } + + model = MODEL_REGISTRY[args.model_type](args.model_path, args.torch_dtype) + + t_model = MODEL_REGISTRY[args.model_type](args.t_model_path, args.torch_dtype) + + if args.online_rotate: + # import gc + + import yaml + from easydict import EasyDict + + with open(args.config_path, 'r') as file: + config = yaml.safe_load(file) + config = EasyDict(config) + + tokenizer = BaseTokenizer(args.model_path, args.tokenizer_mode) + dataset = BaseDataset(tokenizer.get_tokenizer(), config.calib) + calib_data = dataset.get_calib_dataset() + t_model.collect_first_block_input(calib_data) + del calib_data + gc.collect() + torch.cuda.empty_cache() + + blockwise_opt = ALGO_REGISTRY[config.quant.method]( + t_model, config.quant, t_model.get_first_block_input(), config + ) + blockwise_opt.run_block_loop() + t_model = blockwise_opt.model + + for n, m in t_model.model.named_modules(): + if isinstance(m, RotateLinear): + logger.info(m) + if 'down_proj' in n: + down_rotater = m.rotater + else: + o_rotater = m.rotater + + logger.info(t_model) + logger.info(model) + + tokenizer = BaseTokenizer(args.model_path, args.tokenizer_mode) + dataset = BaseDataset(tokenizer.get_tokenizer(), calib_cfg) + + calib_data = dataset.get_calib_dataset() + + model.collect_first_block_input(calib_data) + t_model.collect_first_block_input(calib_data) + + fp_inps = model.get_first_block_input() + t_fp_inps = t_model.get_first_block_input() + + res = {} + t_res = {} + + org_w = {} + trans_w = {} + + wquanter = analysis_quanter( + bit=args.wbit, + symmetric=args.wsym, + granularity=args.wgra, + group_size=args.group_size, + ) + + if not args.w_only: + aquanter = Quantizer(bit=args.abit, symmetric=args.asym, granularity=args.agra) + + def a_qdq(act, module=None): + return aquanter.fake_quant_act_dynamic(act) + + if args.cosine: + params_dict = {} + params_dict['w_qdq'] = wquanter.fake_quant_weight_dynamic + params_dict['a_qdq'] = None if args.w_only else a_qdq + t_model.replace_module_all(FakeQuantLinear, params_dict) + + with torch.no_grad(): + for i in tqdm(range(len(model.blocks))): + block = model.blocks[i] + t_block = t_model.blocks[i] + block.cuda() + t_block.cuda() + + t_hooks = register_hook(t_block, i, args) + t = True + t_fp_inps['data'] = block_forward( + t_block, t_fp_inps['data'], t_fp_inps['kwargs'] + ) + + hooks = register_hook(block, i, args) + t = False + fp_inps['data'] = block_forward(block, fp_inps['data'], fp_inps['kwargs']) + + block.cpu() + + t_block.cpu() + + for h in hooks: + h.remove() + + for t_h in t_hooks: + t_h.remove() + 
+ if args.cosine: + analysis_block_cosine(res, t_res, args) + else: + analysis_block_outlier(res, t_res, org_w, trans_w, args) + + res.clear() + t_res.clear() + org_w.clear() + trans_w.clear() + + gc.collect() + torch.cuda.empty_cache() diff --git a/tools/token_analysis.py b/tools/token_analysis.py new file mode 100644 index 000000000..48fd3dcd7 --- /dev/null +++ b/tools/token_analysis.py @@ -0,0 +1,185 @@ +import gc +from concurrent.futures import ThreadPoolExecutor + +import torch +import torch.nn as nn +from datasets import load_dataset, load_from_disk +from loguru import logger + + +class TokenConsistencyEval: + def __init__(self, tokenizer, eval_cfg): + self.tokenizer = tokenizer + # eval_cfg + logger.info(f'eval_cfg : {eval_cfg}') + self.dataset = eval_cfg['name'] + assert self.dataset in [ + 'wikitext2', + 'c4', + 'ptb', + ], 'Token consistency eval only supports wikitext2, c4, ptb datasets now.' + self.seq_len = eval_cfg['seq_len'] + self.bs = eval_cfg['bs'] + self.path = eval_cfg.get('path', None) + self.download = eval_cfg['download'] + self.inference_per_block = eval_cfg.get('inference_per_block', False) + self.testenc = self.build_data() + + @torch.no_grad() + def build_data(self): + # load data + if self.download: + if self.dataset == 'wikitext2': + testdata = load_dataset('wikitext', 'wikitext-2-raw-v1', split='test') + elif self.dataset == 'c4': + testdata = load_dataset( + 'allenai/c4', + data_files={ + 'validation': 'en/c4-validation.00000-of-00008.json.gz' + }, + split='validation', + ) + elif self.dataset == 'ptb': + testdata = load_dataset('ptb_text_only', 'penn_treebank', split='test') + else: + assert self.path, 'Please set path in eval_cfg.' + testdata = load_from_disk(self.path) + + # encode data + if self.dataset == 'wikitext2': + testenc = self.tokenizer('\n\n'.join(testdata['text']), return_tensors='pt') + elif self.dataset == 'c4': + testenc = self.tokenizer( + ' '.join(testdata[:1100]['text']), return_tensors='pt' + ) + testenc.input_ids = testenc.input_ids[:, : (256 * self.seq_len)] + elif self.dataset == 'ptb': + testenc = self.tokenizer( + ' '.join(testdata['sentence']), return_tensors='pt' + ) + return testenc + + @torch.no_grad() + def eval(self, model_llmc_1, model_llmc_2): + model1 = model_llmc_1.get_model() + model2 = model_llmc_2.get_model() + + if self.inference_per_block: + handles1 = [] + handles2 = [] + for layer in model_llmc_1.get_blocks(): + handles1.append(layer.register_forward_pre_hook(self.forward_pre_hook)) + handles1.append(layer.register_forward_hook(self.forward_hook)) + for layer in model_llmc_2.get_blocks(): + handles2.append(layer.register_forward_pre_hook(self.forward_pre_hook)) + handles2.append(layer.register_forward_hook(self.forward_hook)) + for layer in model_llmc_1.get_layers_except_blocks(): + layer.cuda() + for layer in model_llmc_2.get_layers_except_blocks(): + layer.cuda() + else: + model1.cuda() + model2.cuda() + + model1.eval() + model2.eval() + + consistency = self.eval_token_consistency(model1, model2, self.testenc, self.seq_len, self.bs) + + if self.inference_per_block: + for h in handles1 + handles2: + h.remove() + + model1.cpu() + model2.cpu() + gc.collect() + torch.cuda.empty_cache() + return consistency + + @torch.no_grad() + def forward_pre_hook(self, m, x): + m.cuda() + + @torch.no_grad() + def forward_hook(self, m, x, y): + with ThreadPoolExecutor() as executor: + executor.submit(self.load_layer_to_cpu, m) + + @torch.no_grad() + def load_layer_to_cpu(self, m): + m.cpu() + + @torch.no_grad() + def 
eval_token_consistency(self, model1, model2, testenc, seq_len, bs): + testenc = testenc.input_ids + nsamples = testenc.numel() // seq_len + + consistent_tokens = 0 + total_tokens = 0 + + # Loop through each batch + for i in range(0, nsamples, bs): + logger.info(f'index : {(i + 1) // bs}/{nsamples // bs}') + # Calculate end index + j = min(i + bs, nsamples) + + # Prepare inputs and move to gpu + inputs = testenc[:, (i * seq_len): (j * seq_len)].cuda() + inputs = inputs.reshape(j - i, seq_len) + + # Forward pass through the models + logits1 = model1(inputs).logits + logits2 = model2(inputs).logits + + # Get predicted tokens + preds1 = torch.argmax(logits1, dim=-1) + preds2 = torch.argmax(logits2, dim=-1) + + # Compare tokens for consistency + consistent_tokens += (preds1 == preds2).sum().item() + total_tokens += preds1.numel() + + # Calculate consistency ratio + consistency_ratio = consistent_tokens / total_tokens + + # Empty CUDA cache to save memory + testenc.cpu() + torch.cuda.empty_cache() + + return consistency_ratio + + +if __name__ == '__main__': + import sys + + sys.path.append('../../') + import argparse + + from llmc.data import BaseTokenizer + from llmc.models import Llama + from llmc.utils.registry_factory import MODEL_REGISTRY + + parser = argparse.ArgumentParser() + parser.add_argument('--model_type_1', type=str, required=True) + parser.add_argument('--model_path_1', type=str, required=True) + parser.add_argument('--model_type_2', type=str, required=True) + parser.add_argument('--model_path_2', type=str, required=True) + args = parser.parse_args() + + tokenizer = BaseTokenizer(args.model_path_1) + model1 = MODEL_REGISTRY[args.model_type_1](args.model_path_1, 'auto') + model2 = MODEL_REGISTRY[args.model_type_2](args.model_path_2, 'auto') + + # Llama2-70B config example + eval_cfg = { + 'name': 'wikitext2', + 'seq_len': 2048, + 'bs': 20, + 'download': False, + 'path': 'data_path', + 'inference_per_block': True, + } + token_consistency_eval = TokenConsistencyEval(tokenizer.get_tokenizer(), eval_cfg) + + consistency_ratio = token_consistency_eval.eval(model1, model2) + logger.info(f'Token consistency ratio: {consistency_ratio}') From 7994f92e73afc74793a411095172e08eea029064 Mon Sep 17 00:00:00 2001 From: gushiqiao <77222802+gushiqiao@users.noreply.github.com> Date: Tue, 22 Oct 2024 17:04:15 +0800 Subject: [PATCH 2/5] Update spinquant_w4a4.yml --- configs/quantization/SpinQuant/spinquant_w4a4.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configs/quantization/SpinQuant/spinquant_w4a4.yml b/configs/quantization/SpinQuant/spinquant_w4a4.yml index 0609839d7..6eb46852d 100644 --- a/configs/quantization/SpinQuant/spinquant_w4a4.yml +++ b/configs/quantization/SpinQuant/spinquant_w4a4.yml @@ -53,7 +53,7 @@ train: weight_decay: 0. lr_scheduler_type: "cosine" gradient_checkpointing: True - max_steps: 1 + max_steps: 100 output_dir: output_path logging_dir: your_log_path logging_first_step: True From b7173a77a7a27484a11026618328d913adbe0969 Mon Sep 17 00:00:00 2001 From: gushiqiao <77222802+gushiqiao@users.noreply.github.com> Date: Fri, 25 Oct 2024 11:21:24 +0800 Subject: [PATCH 3/5] Update README.md --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index 81be260a6..9797f5aea 100644 --- a/README.md +++ b/README.md @@ -283,6 +283,8 @@ You can add your own model type referring to files under `llmc/models/*.py`. 
✅ [QuaRot](https://arxiv.org/abs/2404.00456) +✅ [SpinQuant](https://arxiv.org/abs/2405.16406) + ### Pruning ✅ Naive(Magnitude) From 8736fe0d1fd92710dc1851db4beb50146aa08c68 Mon Sep 17 00:00:00 2001 From: gushiqiao <77222802+gushiqiao@users.noreply.github.com> Date: Fri, 25 Oct 2024 11:21:58 +0800 Subject: [PATCH 4/5] Update README_ja.md --- README_ja.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README_ja.md b/README_ja.md index b093271a8..03ba168a3 100644 --- a/README_ja.md +++ b/README_ja.md @@ -267,6 +267,8 @@ ✅ [QuaRot](https://arxiv.org/abs/2404.00456) +✅ [SpinQuant](https://arxiv.org/abs/2405.16406) + ### 剪定 ✅ Naive(Magnitude) From 15e4a45ca105c84f47e81cb0be6cb93afd60e365 Mon Sep 17 00:00:00 2001 From: gushiqiao <77222802+gushiqiao@users.noreply.github.com> Date: Fri, 25 Oct 2024 11:22:17 +0800 Subject: [PATCH 5/5] Update README_zh.md --- README_zh.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README_zh.md b/README_zh.md index 4732f561a..1f43d219d 100644 --- a/README_zh.md +++ b/README_zh.md @@ -263,6 +263,8 @@ ✅ [QuaRot](https://arxiv.org/abs/2404.00456) +✅ [SpinQuant](https://arxiv.org/abs/2405.16406) + ### 剪枝 ✅ Naive(Magnitude)
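Each new launcher under scripts/ follows the same pattern: it pins a GPU via CUDA_VISIBLE_DEVICES, points `llmc` at the repository root, and backgrounds `python -m llmc --config <yml>` with nohup, writing `llm_quant_exp.log` and `llm_quant_exp.pid`. For the SpinQuant launcher the referenced config is configs/quantization/SpinQuant/spinquant_w4a4.yml, whose train section (after the follow-up commit) runs 100 optimization steps with a cosine LR schedule and gradient checkpointing. A minimal usage sketch, assuming the repository is checked out at /path/to/llmc (placeholder) and that `llmc=llmc_path` inside the script has been edited to match:

```bash
# Run from scripts/ so the relative ../configs/... path in the script resolves.
cd /path/to/llmc/scripts

# Edit run_spinquant_llama.sh first:
#   llmc=/path/to/llmc      # repo root, exported onto PYTHONPATH by the script
#   gpu_id=0                # GPU exposed through CUDA_VISIBLE_DEVICES
bash run_spinquant_llama.sh

# The run is backgrounded with nohup; follow progress and clean up via the
# log/pid files derived from task_name=llm_quant_exp.
tail -f llm_quant_exp.log
kill "$(cat llm_quant_exp.pid)"   # stop the background job if needed
```

The other run_*.sh launchers differ only in the config they pass, so the same steps apply to, for example, run_quarot_llama.sh or run_wanda_llama.sh.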
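tools/outlier_analysis.py profiles an original checkpoint (--model_path) against a transformed one (--t_model_path): it hooks every linear layer block by block and either reports the kurtosis of weights and input activations (optionally plotting per-channel min/max ranges with --draw), or, with --cosine, the cosine similarity between the FP original's block outputs and the fake-quantized transformed model's. A sketch of two invocations under assumptions: the paths are placeholders, the Llama model type stands in for whatever key is registered under llmc/models/, and the flags are the ones defined in the script's argparse:

```bash
cd /path/to/llmc/tools

# Per-channel kurtosis / outlier statistics for weights and activations,
# written to the loguru log file given by --log_dir.
python outlier_analysis.py \
    --model_type Llama \
    --model_path /path/to/original_model \
    --t_model_path /path/to/transformed_model \
    --dataset_name wikitext2 --data_path /path/to/wikitext2 \
    --prof_gra per_channel \
    --save_path ./save --log_dir ./outlier_log.txt --draw

# Block-output cosine similarity: the transformed model is wrapped in
# FakeQuantLinear (W4A4 here) and compared against the FP original.
python outlier_analysis.py \
    --model_type Llama \
    --model_path /path/to/original_model \
    --t_model_path /path/to/transformed_model \
    --dataset_name wikitext2 --data_path /path/to/wikitext2 \
    --wbit 4 --abit 4 --cosine --log_dir ./cosine_log.txt
```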
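tools/token_analysis.py measures how often two checkpoints produce the same greedy (argmax) prediction at every position, a quick sanity check that a quantized model still tracks its FP counterpart. Its __main__ block hard-codes the eval_cfg (wikitext2, seq_len 2048, bs 20, download: False, path: 'data_path', inference_per_block: True), so the dataset path must be edited in the file before running. A hedged sketch with placeholder checkpoint paths; PYTHONPATH is set explicitly, mirroring the run scripts, so the llmc imports resolve regardless of the tool's relative sys.path handling:

```bash
# Make the llmc package importable, as the run_*.sh scripts do.
export PYTHONPATH=/path/to/llmc:$PYTHONPATH
cd /path/to/llmc/tools

# First edit eval_cfg['path'] in token_analysis.py to a local wikitext2 copy
# (download is hard-coded to False), then compare an FP model against a
# quantized/transformed export of the same architecture.
python token_analysis.py \
    --model_type_1 Llama --model_path_1 /path/to/fp_model \
    --model_type_2 Llama --model_path_2 /path/to/quant_model

# The script logs "Token consistency ratio: <value>"; 1.0 means the two models
# agree on every greedy token.
```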