backup

```
# Variant: serve the distilled Qwen3.5-27B checkpoint as "qwen27b" on
# 0.0.0.0:8000 with an explicit AWQ quantization flag and a 2048 max model length.
vllm serve /model/Qwen3.5-27B-Claude-4.6-Opus-Reasoning-Distilled \
  --quantization awq \
  --reasoning-parser qwen3 \
  --gpu-memory-utilization 0.9 \
  --max-model-len 2048 \
  --served-model-name qwen27b \
  --host 0.0.0.0 \
  --port 8000

# Flag snippet only, not a runnable command: append these to a `vllm serve`
# invocation. Kept commented out so that pasting this file into a shell does
# not try to execute `--quantization` as a program.
#   --quantization awq \
#   --reasoning-parser qwen3 \
#   --trust-remote-code

# Variant: same distilled checkpoint, no explicit --quantization flag, tokenizer
# taken from Qwen/Qwen3.5-27B, and --cpu-offload-gb set to 35.
vllm serve /model/Qwen3.5-27B-Claude-4.6-Opus-Reasoning-Distilled \
  --tokenizer Qwen/Qwen3.5-27B \
  --trust-remote-code \
  --reasoning-parser qwen3 \
  --cpu-offload-gb 35 \
  --gpu-memory-utilization 0.9 \
  --max-model-len 2048 \
  --served-model-name qwen27b \
  --host 0.0.0.0 \
  --port 8000

# Variant: distilled checkpoint with --cpu-offload-gb 30, GPU memory
# utilization 0.85, and --enforce-eager.
vllm serve /model/Qwen3.5-27B-Claude-4.6-Opus-Reasoning-Distilled \
  --tokenizer Qwen/Qwen3.5-27B \
  --trust-remote-code \
  --reasoning-parser qwen3 \
  --enforce-eager \
  --cpu-offload-gb 30 \
  --gpu-memory-utilization 0.85 \
  --max-model-len 2048 \
  --served-model-name qwen27b \
  --host 0.0.0.0 \
  --port 8000
  
# Variant: the v2 AWQ checkpoint served as "qwen27b" with float16 dtype and a
# 32768 max model length.
vllm serve /model/Qwen3.5-27B-v2-AWQ \
  --tokenizer Qwen/Qwen3.5-27B \
  --dtype float16 \
  --trust-remote-code \
  --reasoning-parser qwen3 \
  --gpu-memory-utilization 0.90 \
  --max-model-len 32768 \
  --served-model-name qwen27b \
  --host 0.0.0.0 \
  --port 8000
  
  
# Source repo: btbtyler09/Qwen3-Coder-30B-A3B-Instruct-gptq-4bit
# (kept as a comment; a bare repo id on its own line is not a command).
huggingface-cli download btbtyler09/Qwen3-Coder-30B-A3B-Instruct-gptq-4bit \
  --local-dir /model/Qwen3-Coder-30B-A3B-Instruct-gptq-4bit

# Variant: Qwen3-Coder GPTQ checkpoint served as "qwen3-coder" with the plain
# gptq quantization method, the hermes tool-call parser, and explicit limits on
# concurrent sequences (8) and batched tokens (32768).
vllm serve /model/Qwen3-Coder-30B-A3B-Instruct-gptq-4bit \
  --quantization gptq \
  --dtype float16 \
  --trust-remote-code \
  --enable-auto-tool-choice \
  --tool-call-parser hermes \
  --max-num-seqs 8 \
  --max-num-batched-tokens 32768 \
  --gpu-memory-utilization 0.90 \
  --max-model-len 32768 \
  --served-model-name qwen3-coder \
  --host 0.0.0.0 \
  --port 8000
  
  
# Variant: Qwen3-Coder GPTQ checkpoint served as "qwen3-coder" with the
# gptq_marlin quantization method and the hermes tool-call parser.
vllm serve /model/Qwen3-Coder-30B-A3B-Instruct-gptq-4bit \
  --quantization gptq_marlin \
  --dtype float16 \
  --trust-remote-code \
  --enable-auto-tool-choice \
  --tool-call-parser hermes \
  --gpu-memory-utilization 0.90 \
  --max-model-len 32768 \
  --served-model-name qwen3-coder \
  --host 0.0.0.0 \
  --port 8000
  
  
  # Smoke test: POST one user message to the local /v1/chat/completions
  # endpoint, addressing the model by its served name "qwen27b".
  curl --header "Content-Type: application/json" \
  --data '{
    "model": "qwen27b",
    "messages": [
      {"role": "user", "content": "记住了马"}
    ],
    "max_tokens": 512
  }' \
  http://127.0.0.1:8000/v1/chat/completions

# Variant: the v2 AWQ checkpoint served as "qwen27b" with float16 dtype and a
# 2048 max model length.
vllm serve /model/Qwen3.5-27B-v2-AWQ \
  --tokenizer Qwen/Qwen3.5-27B \
  --dtype float16 \
  --trust-remote-code \
  --reasoning-parser qwen3 \
  --gpu-memory-utilization 0.90 \
  --max-model-len 2048 \
  --served-model-name qwen27b \
  --host 0.0.0.0 \
  --port 8000

# Log in to clawhub. The token is read from the CLAWHUB_TOKEN environment
# variable so that no credential is stored in this notes file; the expansion
# aborts with a message if the variable is unset or empty.
# SECURITY: a literal token was previously written on this line. Treat it as
# leaked and revoke/rotate it.
clawhub login --token "${CLAWHUB_TOKEN:?export CLAWHUB_TOKEN before running}"

################ Working configuration
# Serves the Qwen3-Coder GPTQ checkpoint (gptq_marlin) under the served name
# "qwen27b".
# NOTE(review): the name does not match the model; presumably kept so existing
# clients need no change. Confirm.
# Tool calling: exactly one --enable-auto-tool-choice and one --tool-call-parser
# are passed. Repeated options resolve last-wins, so a stale `hermes` parser
# listed before `qwen3_coder` had no effect and has been dropped.
vllm serve /model/Qwen3-Coder-30B-A3B-Instruct-gptq-4bit \
  --served-model-name qwen27b \
  --host 0.0.0.0 \
  --port 8000 \
  --max-model-len 32768 \
  --gpu-memory-utilization 0.90 \
  --quantization gptq_marlin \
  --trust-remote-code \
  --dtype float16 \
  --enable-auto-tool-choice \
  --tool-call-parser qwen3_coder
  
  # Variant: Qwen3.5-27B GPTQ-int4 checkpoint served as "qwen27b" with eager
  # mode, at most 4 concurrent sequences, a 31360 max model length, the qwen3
  # reasoning parser and hermes tool calling.
  vllm serve /model/Qwen3.5-27B-GPTQ-int4 \
    --dtype float16 \
    --trust-remote-code \
    --enforce-eager \
    --reasoning-parser qwen3 \
    --enable-auto-tool-choice \
    --tool-call-parser hermes \
    --max-num-seqs 4 \
    --gpu-memory-utilization 0.92 \
    --max-model-len 31360 \
    --served-model-name qwen27b \
    --host 0.0.0.0 \
    --port 8000
  
  
# Run llama-server on the Q4_K_M GGUF: context size 98304, 99 GPU layers,
# 8 threads, 1 parallel slot, metrics enabled, listening on 0.0.0.0:8000.
~/llama.cpp/llama-server \
  --model /model/Qwen3.5-27B-v2-GGUF/*Q4_K_M*.gguf \
  --host 0.0.0.0 \
  --port 8000 \
  --n-gpu-layers 99 \
  --ctx-size 98304 \
  --threads 8 \
  --parallel 1 \
  --metrics

# Install llama.cpp: clone into the home directory, configure a static CUDA
# build, compile only the llama-server target, then copy the binary next to
# the source tree.
cd ~
git clone https://github.com/ggml-org/llama.cpp
cmake -S llama.cpp -B llama.cpp/build -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=OFF
cmake --build llama.cpp/build --target llama-server --config Release --parallel
cp llama.cpp/build/bin/llama-server llama.cpp/

# Download only the files matching *Q4_K_M* from the GGUF repo.
huggingface-cli download \
  Jackrong/Qwen3.5-27B-Claude-4.6-Opus-Reasoning-Distilled-v2-GGUF \
  --local-dir /model/Qwen3.5-27B-v2-GGUF \
  --include "*Q4_K_M*"

# Start the server (OpenAI-compatible API) on 0.0.0.0:8000 with a 32768
# context size, 99 GPU layers and 8 threads.
~/llama.cpp/llama-server \
  --model /model/Qwen3.5-27B-v2-GGUF/*Q4_K_M*.gguf \
  --n-gpu-layers 99 \
  --ctx-size 32768 \
  --threads 8 \
  --host 0.0.0.0 \
  --port 8000
  ```