DeepSeek-R1 Deployment on RK35XX
(The official demo has been updated: rknn-llm/examples/DeepSeek-R1-Distill-Qwen-1.5B_Demo at main · airockchip/rknn-llm)
1. Set up the RKNN environment
* Download rknn-llm: airockchip/rknn-llm
* Follow its documentation to set up the rknn-llm/rknn-toolkit2 environment (a quick import check is sketched below).
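A minimal sanity check for the conversion environment, assuming the rkllm-toolkit wheel from the rknn-llm repo has been installed into the current Python environment per its docs:
```python
# Sanity check: if this import succeeds, the conversion environment is usable.
# Assumes the rkllm-toolkit wheel has been installed per the rknn-llm docs.
from rkllm.api import RKLLM

llm = RKLLM()
print("rkllm-toolkit imported OK:", llm)
```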
2. Download the DeepSeek-R1-1.5B Hugging Face model
* Create a new directory and download all the files from here (or script the download as sketched below):
deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B at main
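If you prefer to script the download, here is a sketch using huggingface_hub (the repo id matches the link above; the local_dir name is an arbitrary choice):
```python
# Download every file of the model repo into a local directory.
# Requires: pip install huggingface_hub
from huggingface_hub import snapshot_download

snapshot_download(
    repo_id="deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
    local_dir="./DeepSeek-R1-Distill-Qwen-1.5B",  # arbitrary local directory
)
```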
3. Write the conversion script and place it in the DeepSeek model directory
```python
from rkllm.api import RKLLM
from datasets import load_dataset
from transformers import AutoTokenizer
from tqdm import tqdm
import torch
from torch import nn
import os
# os.environ['CUDA_VISIBLE_DEVICES'] = '1'

modelpath = '.'
llm = RKLLM()

# Load model
# Use 'export CUDA_VISIBLE_DEVICES=2' to specify the GPU device
# options: ['cpu', 'cuda']
ret = llm.load_huggingface(model=modelpath, model_lora=None, device='cpu')
# ret = llm.load_gguf(model=modelpath)
if ret != 0:
    print('Load model failed!')
    exit(ret)

# Build model
dataset = "./data_quant.json"
# JSON file format; note that the prompt must be included in the input, like this:
# [{"input": "Human: 你好!\nAssistant: ", "target": "你好!我是人工智能助手KK!"}, ...]
qparams = None
# qparams = 'gdq.qparams'  # Use extra_qparams
# ret = llm.build(do_quantization=True, optimization_level=1, quantized_dtype='w8a8',
#                 quantized_algorithm='normal', target_platform='rk3588', num_npu_core=3,
#                 extra_qparams=qparams, dataset=dataset)
ret = llm.build(do_quantization=True, optimization_level=1, quantized_dtype='w8a8',
                quantized_algorithm='normal', target_platform='rk3576', num_npu_core=2,
                extra_qparams=qparams, dataset=dataset)
if ret != 0:
    print('Build model failed!')
    exit(ret)

# Evaluate accuracy (perplexity on WikiText-2)
def eval_wikitext(llm):
    seqlen = 512
    tokenizer = AutoTokenizer.from_pretrained(
        modelpath, trust_remote_code=True)
    # Dataset download link:
    # https://huggingface.co/datasets/Salesforce/wikitext/tree/main/wikitext-2-raw-v1
    testenc = load_dataset(
        "parquet", data_files='./wikitext/wikitext-2-raw-v1/test-00000-of-00001.parquet', split='train')
    testenc = tokenizer("\n\n".join(
        testenc['text']), return_tensors="pt").input_ids
    nsamples = testenc.numel() // seqlen
    nlls = []
    for i in tqdm(range(nsamples), desc="eval_wikitext: "):
        batch = testenc[:, (i * seqlen):((i + 1) * seqlen)]
        inputs = {"input_ids": batch}
        lm_logits = llm.get_logits(inputs)
        if lm_logits is None:
            print("get logits failed!")
            return
        # Shift so that each token's logits predict the following token
        shift_logits = lm_logits[:, :-1, :]
        shift_labels = batch[:, 1:].to(lm_logits.device)
        loss_fct = nn.CrossEntropyLoss().to(lm_logits.device)
        loss = loss_fct(
            shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
        neg_log_likelihood = loss.float() * seqlen
        nlls.append(neg_log_likelihood)
    ppl = torch.exp(torch.stack(nlls).sum() / (nsamples * seqlen))
    print(f'wikitext-2-raw-v1-test ppl: {round(ppl.item(), 2)}')
# eval_wikitext(llm)

# Chat with the model
messages = "<|im_start|>system You are a helpful assistant.<|im_end|><|im_start|>user你好!\n<|im_end|><|im_start|>assistant"
kwargs = {"max_length": 128, "top_k": 1, "top_p": 0.8,
          "temperature": 0.8, "do_sample": True, "repetition_penalty": 1.1}
# print(llm.chat_model(messages, kwargs))

# Export rkllm model
ret = llm.export_rkllm("./deepseek-r1.rkllm")
if ret != 0:
    print('Export model failed!')
    exit(ret)
```
4. If you need an accuracy evaluation, download the dataset referenced in the script and run the evaluation; if you need to adjust the quantization strategy, modify the corresponding settings above (a sketch for generating a minimal calibration set follows). Running the script yields the converted model deepseek-r1.rkllm.
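If you do not already have a calibration set, here is a minimal sketch for generating data_quant.json in the format described by the comment in the script (the samples are placeholders; use data representative of your real workload):
```python
# Write a tiny quantization-calibration dataset in the format expected
# by llm.build(); replace the placeholder samples with realistic ones.
import json

samples = [
    {"input": "Human: 你好!\nAssistant: ", "target": "你好!我是人工智能助手!"},
    {"input": "Human: What is 2 + 2?\nAssistant: ", "target": "2 + 2 = 4."},
]
with open("data_quant.json", "w", encoding="utf-8") as f:
    json.dump(samples, f, ensure_ascii=False, indent=2)
```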
5. Deploy and run on the board (write a board-side program based on the rkllm API)
```cpp
#include <cstdio>
#include <cstdint>
#include <cstdlib>
#include <cstring>
#include <string>
#include <iostream>
#include <fstream>
#include <vector>
#include <csignal>
#include "rkllm.h"

#define MODEL_PATH "/data/deepseek-r1_3588_w8a8.rkllm"
#define PROMPT_TEXT_PREFIX "<|User|>"
#define PROMPT_TEXT_POSTFIX "<|Assistant|>"

LLMHandle llmHandle = nullptr;

void exit_handler(int signal) {
    if (llmHandle != nullptr) {
        std::cout << "The program is about to exit" << std::endl;
        LLMHandle _tmp = llmHandle;
        llmHandle = nullptr;
        rkllm_destroy(_tmp);
    }
    exit(signal);
}

void callback(RKLLMResult *result, void *userdata, LLMCallState state) {
    if (state == RKLLM_RUN_FINISH) {
        printf("\n");
    } else if (state == RKLLM_RUN_ERROR) {
        printf("\nrun error\n");
    } else if (state == RKLLM_RUN_GET_LAST_HIDDEN_LAYER) {
        /* ==========================================================================
           When the GET_LAST_HIDDEN_LAYER feature is used, the callback returns the
           memory pointer last_hidden_layer, the token count num_tokens, and the
           hidden-layer size embd_size. The data in last_hidden_layer can be read
           through these three parameters.
           Note: it must be read inside the current callback; if it is not fetched
           in time, the pointer will be freed on the next callback.
           ========================================================================== */
        if (result->last_hidden_layer.embd_size != 0 && result->last_hidden_layer.num_tokens != 0) {
            int data_size = result->last_hidden_layer.embd_size * result->last_hidden_layer.num_tokens * sizeof(float);
            printf("\ndata_size:%d", data_size);
            std::ofstream outFile("last_hidden_layer.bin", std::ios::binary);
            if (outFile.is_open()) {
                outFile.write(reinterpret_cast<const char *>(result->last_hidden_layer.hidden_states), data_size);
                outFile.close();
                std::cout << "Data saved to last_hidden_layer.bin successfully!" << std::endl;
            } else {
                std::cerr << "Failed to open the file for writing!" << std::endl;
            }
        }
    } else if (state == RKLLM_RUN_NORMAL) {
        printf("%s", result->text);
    }
}

int main() {
    signal(SIGINT, exit_handler);
    printf("rkllm init start\n");

    // Set parameters and initialize
    RKLLMParam param = rkllm_createDefaultParam();
    param.model_path = MODEL_PATH;

    // Sampling parameters
    param.top_k = 1;
    param.top_p = 0.95;
    param.temperature = 0.8;
    param.repeat_penalty = 1.1;
    param.frequency_penalty = 0.0;
    param.presence_penalty = 0.0;

    param.max_new_tokens = 128000;
    param.max_context_len = 128000;
    param.skip_special_token = true;
    param.extend_param.base_domain_id = 0;

    int ret = rkllm_init(&llmHandle, &param, callback);
    if (ret == 0) {
        printf("rkllm init success\n");
    } else {
        printf("rkllm init failed\n");
        exit_handler(-1);
    }

    std::string text;
    RKLLMInput rkllm_input;

    // Initialize the inference parameter struct
    RKLLMInferParam rkllm_infer_params;
    memset(&rkllm_infer_params, 0, sizeof(RKLLMInferParam));  // zero-initialize everything
    rkllm_infer_params.mode = RKLLM_INFER_GENERATE;

    while (true) {
        std::string input_str;
        printf("\n");
        printf("user: ");
        std::getline(std::cin, input_str);
        if (input_str == "exit") {
            break;
        }
        text = PROMPT_TEXT_PREFIX + input_str + PROMPT_TEXT_POSTFIX;
        rkllm_input.input_type = RKLLM_INPUT_PROMPT;
        rkllm_input.prompt_input = (char *)text.c_str();
        printf("robot: ");
        // For plain text generation, set the mode to RKLLM_INFER_GENERATE
        // (or simply leave the infer params unconfigured).
        rkllm_run(llmHandle, &rkllm_input, &rkllm_infer_params, NULL);
    }

    rkllm_destroy(llmHandle);
    return 0;
}
```
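To build this for the board, cross-compile it with an aarch64 toolchain and link against the rkllm runtime library (librkllmrt.so) shipped in the rknn-llm repo; the build scripts in the official demo can serve as a template. Then push the binary, the runtime library, and the converted .rkllm model (the MODEL_PATH above) to the board and run it there.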
Things to note:
1. If you copied this from the demo, remove the scenario prompt; DeepSeek does not need a system prompt.
2. Pay attention to the character encoding of Chinese input in the terminal.
3. On RK3588, 8-bit quantization requires 8 GB of RAM or more; DDR usage at runtime is around 70%.
4. On RK3576, 4-bit quantization requires 4 GB of RAM or more; DDR usage at runtime is around 80%.