jasper pytorch模型直接加载或转onnx或转tf模型均不成功

[复制链接] · 发表于 2020-3-14 17:28:27

本帖最后由 xsky 于 2020-3-14 17:30 编辑

rknn  v1.3.0
pytorch v1.2
tensorflow  v1.14
onnx  v1.4.1
onnx-tf  v1.3.0

代码及模型
.pt是原始模型
链接：https://pan.baidu.com/s/10xjD5cuDKNkN_c-AWlxksg
提取码：1okw

1. .pt模型是可以正确运行的;
但rknn加载pytorch模型提示错误:

E Catch exception when loading pytorch model: toml_fp32_acoustic.pt!

E Traceback (most recent call last):

E File "rknn\api\rknn_base.py", line 567, in rknn.api.rknn_base.RKNNBase.load_pytorch

E File "rknn\base\RKNNlib\app\importer\import_pytorch.py", line 95, in rknn.base.RKNNlib.app.importer.import_pytorch.ImportPytorch.run

E File "rknn\base\RKNNlib\converter\convert_pytorch.py", line 517, in rknn.base.RKNNlib.converter.convert_pytorch.convert_pytorch.__init__

E File "rknn\base\RKNNlib\converter\convert_pytorch.py", line 601, in rknn.base.RKNNlib.converter.convert_pytorch.convert_pytorch.model_simplify

E File "rknn\base\RKNNlib\converter\convert_pytorch.py", line 104, in rknn.base.RKNNlib.converter.convert_pytorch.torch_inference_engine.shape_pick

……   类似重复n次

E File "rknn\base\RKNNlib\converter\convert_pytorch.py", line 104, in rknn.base.RKNNlib.converter.convert_pytorch.torch_inference_engine.shape_pick

E File "rknn\base\RKNNlib\converter\convert_pytorch.py", line 139, in rknn.base.RKNNlib.converter.convert_pytorch.torch_inference_engine.__ir_shape_inference

E File "rknn\base\RKNNlib\converter\convert_pytorch.py", line 251, in rknn.base.RKNNlib.converter.convert_pytorch.torch_inference_engine.convolution_shape

E File "rknn\base\RKNNlib\converter\convert_pytorch.py", line 242, in rknn.base.RKNNlib.converter.convert_pytorch.torch_inference_engine._pad_pick

E IndexError: list index out of range

Traceback (most recent call last):
复制代码

2. .onnx模型也可以正确运行,直接使用onnx-tf运行结果与.pt结果一致  (onnx-tf需要更新为v1.3.0)
  onnx转为.rknn（未量化）后可以运行，但结果错误，输出为全0
3.  通过onnx转为tf模型然后再转为.rknn, 运行提示模型错误失败:

--> Init runtime environment

rknn run target self

I npu_transfer_proxy pid: 748, status: sleeping

D NPUTransfer: efuseid = 3399:4632a3468cf39f3b

I NPUTransfer: Starting NPU Transfer Client, Transfer version 1.9.8 (cab3961@2019-12-12T09:54:26)

D NPUTransfer: Transfer spec = local:transfer_proxy

D NPUTransfer: Transfer interface successfully opened, fd = 4

E RKNNAPI: rknn_init, msg_load_ack fail, ack = 1(ACK_FAIL), expect 0(ACK_SUCC)!

E RKNNAPI: ==============================================

E RKNNAPI: RKNN VERSION:

E RKNNAPI: API: 1.3.0 (c5654ea build: 2019-12-25 12:40:55)

E RKNNAPI: DRV: 1.3.0 (c4f8c23 build: 2019-11-25 10:39:29)

E RKNNAPI: ==============================================

D NPUTransfer: Transfer client closed, fd = 4

E Catch exception when init runtime!

E Traceback (most recent call last):

E File "rknn/api/rknn_base.py", line 988, in rknn.api.rknn_base.RKNNBase.init_runtime

E File "rknn/api/rknn_runtime.py", line 320, in rknn.api.rknn_runtime.RKNNRuntime.build_graph

E Exception: RKNN init failed. error code: RKNN_ERR_MODEL_INVALID

Init runtime environment failed
复制代码

代码:



import platform

import os

import torch

import numpy as np



from rknn.api import RKNN

from timeit import default_timer as timer



import onnx

from onnx_tf.backend import prepare



def rm(file):
    """Delete the file at path ``file``; do nothing if it does not exist.

    The removal is attempted directly instead of checking ``os.path.exists``
    first, so there is no window between the check and the delete in which
    the file can vanish, and a dangling symlink at ``file`` is removed too
    (``os.path.exists`` reports such a link as missing).

    Any other failure (e.g. ``file`` is a directory, or permission is
    denied) still propagates as ``OSError``.
    """
    try:
        os.remove(file)
    except FileNotFoundError:
        # Already absent: nothing to clean up.
        pass



if __name__ == '__main__':
    # Repro script: run the TorchScript acoustic model in PyTorch, run its
    # ONNX export through onnx-tf, convert it with RKNN-Toolkit, and print
    # every backend's output so the results can be compared by eye.

    rknn = RKNN(verbose=True)
    # Other config options that were experimented with:
    #   asymmetric_quantized-u8, quantized_dtype='dynamic_fixed_point-16',
    #   channel_mean_value='0 0 0 1', reorder_channel='0 1 2'
    rknn.config(batch_size=1, epochs=1)
    system = platform.system()

    seq_len = 64
    isize = 672
    pt_file = 'toml_fp32_acoustic.pt'
    m = torch.jit.load(pt_file)
    m.eval()

    # Deterministic ramp input reshaped to (1, 1, seq_len, isize).
    # Alternative random input: torch.rand((1, seq_len, isize))
    x = torch.linspace(-1, 1, steps=(isize * seq_len))
    x = x.view(1, 1, seq_len, -1)
    input_size_list = [[1, 1, seq_len, isize]]
    ix = x.numpy()

    t0 = timer()
    out = m.forward(x)
    t1 = timer()
    print("torch out, time:", t1 - t0)
    print(out.detach().numpy())

    # NOTE(review): the load_*/export_rknn return values were previously
    # ignored, so a failed import fell through into build(). They are assumed
    # to follow the same "0 means success" convention that build() and
    # init_runtime() are already checked against below.
    use_pytorch = False
    if use_pytorch:
        ret = rknn.load_pytorch(model=pt_file, input_size_list=input_size_list)
        if ret != 0:
            print('rknn.load_pytorch failed!')
            raise SystemExit(ret)
    else:
        onnx_file = pt_file + '.onnx'
        # onnx_file = 'toml_fp32_acoustic.pt_op10.onnx'
        tf_model = onnx.load(onnx_file)
        tf_rep = prepare(tf_model)

        t0 = timer()
        tf_out = tf_rep.run(ix)
        t1 = timer()
        tf_out = np.array(tf_out)
        print("onnx out:", ix.shape, ', ', tf_out.shape, " time:", t1 - t0)
        print(tf_out)

        ret = rknn.load_onnx(model=onnx_file)
        if ret != 0:
            print('rknn.load_onnx failed!')
            raise SystemExit(ret)

        # Alternative path: convert the ONNX model to a TF .pb and load that.
        # pb_file = onnx_file + '.pb'
        # rm(pb_file)
        # tf_rep.export_graph(pb_file)
        # input_size_list = [[1, seq_len, isize]]
        # rknn.load_tensorflow(tf_pb=pb_file,
        #                      inputs=['input'],
        #                      outputs=['transpose_326'],
        #                      input_size_list=input_size_list)

    is_quant = False
    rknn_file = pt_file + ('_quant.rknn' if is_quant else '.rknn')

    print('--> Building model')
    ret = rknn.build(do_quantization=is_quant, dataset='./dataset.txt')
    if ret != 0:
        print('rknn.build failed!')
        # SystemExit is raised directly: the exit() builtin is injected by
        # the site module and is not guaranteed to exist in every runtime.
        raise SystemExit(ret)
    print('build done')

    # Drop any stale artifact first so a failed export cannot be mistaken
    # for a fresh model file.
    rm(rknn_file)
    ret = rknn.export_rknn(rknn_file)
    if ret != 0:
        print('rknn.export_rknn failed!')
        raise SystemExit(ret)

    print('--> Init runtime environment')
    if system == 'Windows':
        # On a Windows host the model runs on an attached RK1808 device.
        print("rknn run target rk1808")
        ret = rknn.init_runtime(target='rk1808', target_sub_class='AICS')
    else:
        print("rknn run target self")
        ret = rknn.init_runtime()
    if ret != 0:
        print('Init runtime environment failed')
        raise SystemExit(ret)
    print('done')

    print('input:')
    print(ix)

    # PyTorch reference output, printed again right before the RKNN result.
    t0 = timer()
    torch_out = m.forward(x)
    t1 = timer()
    print("torch [out]:", x.shape, ', ', torch_out.shape, " time:", t1 - t0)
    print(torch_out.detach().numpy())

    t0 = timer()
    # Optional extra argument that was tried: inputs_pass_through=[1]
    rknn_out = rknn.inference(inputs=[ix], data_type='float32', data_format='nchw')
    t1 = timer()

    if rknn_out is None:
        print('rknn inference failed')
        raise SystemExit(-100)
    y = np.array(rknn_out[0])
    print("rknn out:", ix.shape, ', ', y.shape, " time:", t1 - t0)
    print(y)

    rknn.release()

复制代码

只看该作者 · 发表于 2020-3-17 08:55:06

已上报问题

只看该作者 · 发表于 2020-3-21 18:09:23

jefferyzhang 发表于 2020-3-17 08:55
已上报问题

麻烦能不能帮忙问一下这个问题有没有什么头绪？现在着急评估项目，谢谢！

拆开算子试了一下
.load_pytorch应该是对conv1d不支持

onnx可以正确运行conv1d
但
conv1d → relu → bn load_onnx运行结果正确
conv1d → bn → relu：pytorch/onnx输出shape为(1, 64, 672)但rknn输出shape为(1, 672, 64)
主楼中的模型就是类似这样的结构重复多次，这里输出shape会被改变，再往后传输的时候会错乱吧



import platform

import os

import torch

import numpy as np

import copy



from rknn.api import RKNN

from timeit import default_timer as timer



import onnx

from onnx_tf.backend import prepare



def rm(file):
    """Delete the file at path ``file``; do nothing if it does not exist.

    The removal is attempted directly instead of checking ``os.path.exists``
    first, so there is no window between the check and the delete in which
    the file can vanish, and a dangling symlink at ``file`` is removed too
    (``os.path.exists`` reports such a link as missing).

    Any other failure (e.g. ``file`` is a directory, or permission is
    denied) still propagates as ``OSError``.
    """
    try:
        os.remove(file)
    except FileNotFoundError:
        # Already absent: nothing to clean up.
        pass





class MyConv1d(torch.nn.Module):
    """Minimal Conv1d -> BatchNorm1d -> ReLU stack used to isolate the issue.

    The convolution keeps the channel count (``seq_len`` in, ``seq_len`` out)
    and uses kernel 11 with padding 5 and stride 1, so the last dimension
    (``isize``) is preserved as well.
    """

    def __init__(self, seq_len, isize):
        """Build the three layers.

        Args:
            seq_len: number of input and output channels of the convolution,
                and the feature count of the batch norm.
            isize: length of the last dimension expected by ``forward``.
        """
        super().__init__()

        self._conv1d = torch.nn.Conv1d(
            in_channels=seq_len,
            out_channels=seq_len,
            kernel_size=11,
            stride=1,
            padding=5,
            dilation=1,
            groups=1,
            bias=False,
        )
        self._seq_len = seq_len
        self._isize = isize
        self._bn = torch.nn.BatchNorm1d(num_features=seq_len, eps=1e-3, momentum=0.1)
        self._relu = torch.nn.ReLU()

    def forward(self, x):
        """Apply conv -> batch norm -> ReLU and return the activated tensor.

        ``x`` is first viewed as ``(1, seq_len, isize)``, so it must hold
        exactly ``seq_len * isize`` elements (batch size is fixed to 1).
        """
        as_3d = x.view(1, self._seq_len, self._isize)
        return self._relu(self._bn(self._conv1d(as_3d)))





if __name__ == '__main__':
    # Repro script: build a tiny Conv1d -> BN -> ReLU model, run it in
    # PyTorch (eager and TorchScript) and through onnx-tf, convert it with
    # RKNN-Toolkit, run it on the NPU and compare the outputs numerically.

    rknn = RKNN(verbose=True)
    # Other config options that were experimented with:
    #   asymmetric_quantized-u8, quantized_dtype='dynamic_fixed_point-16',
    #   channel_mean_value='0 0 0 1', reorder_channel='0 1 2'
    rknn.config(batch_size=1, epochs=1)
    system = platform.system()

    seq_len = 64
    isize = 672
    pt_file = 'conv1d.pt'

    # Random input reshaped to (1, 1, seq_len, isize); the model views it
    # back to (1, seq_len, isize) internally.
    # Alternative deterministic input:
    #   torch.linspace(-1, 1, steps=(isize * seq_len))
    x = torch.rand((1, seq_len, isize))
    x = x.view(1, 1, seq_len, -1)
    input_size_list = [[1, seq_len, isize]]
    ix = x.numpy()

    pt_m = MyConv1d(seq_len, isize)
    pt_m.eval()
    t0 = timer()
    out = pt_m.forward(x)
    t1 = timer()
    print("torch out, ", out.shape, "time:", t1 - t0)
    print(out.detach().numpy())

    # Round-trip through TorchScript so the file given to load_pytorch is
    # the same model that is evaluated here.
    jit_m = torch.jit.trace(pt_m, x)
    rm(pt_file)
    jit_m.save(pt_file)

    jit_m = torch.jit.load(pt_file)
    jit_m.eval()

    t0 = timer()
    out = jit_m.forward(x)
    t1 = timer()
    print("torch out, ", out.shape, "time:", t1 - t0)
    print(out.detach().numpy())

    # NOTE(review): the load_*/export_rknn return values were previously
    # ignored, so a failed import fell through into build(). They are assumed
    # to follow the same "0 means success" convention that build() and
    # init_runtime() are already checked against below.
    use_pytorch = False
    if use_pytorch:
        ret = rknn.load_pytorch(model=pt_file, input_size_list=input_size_list)
        if ret != 0:
            print('rknn.load_pytorch failed!')
            raise SystemExit(ret)
    else:
        onnx_file = pt_file + '.onnx'
        rm(onnx_file)
        torch.onnx.export(pt_m, x, onnx_file,
                          export_params=True, verbose=False,
                          input_names=['input'],
                          output_names=['output'],
                          opset_version=10)
        tf_model = onnx.load(onnx_file)
        tf_rep = prepare(tf_model)

        tf_ix = ix
        t0 = timer()
        tf_out = tf_rep.run(tf_ix)
        t1 = timer()
        tf_out = np.array(tf_out)
        print("onnx out:", tf_ix.shape, ', ', tf_out.shape, " time:", t1 - t0)
        print(tf_out)

        ret = rknn.load_onnx(model=onnx_file)
        if ret != 0:
            print('rknn.load_onnx failed!')
            raise SystemExit(ret)

        # Alternative path: convert the ONNX model to a TF .pb and load that.
        # pb_file = onnx_file + '.pb'
        # rm(pb_file)
        # tf_rep.export_graph(pb_file)
        # input_size_list = [[1, seq_len, isize]]
        # rknn.load_tensorflow(tf_pb=pb_file,
        #                      inputs=['input'],
        #                      outputs=['transpose_191'],
        #                      input_size_list=input_size_list)

    is_quant = False
    rknn_file = pt_file + ('_quant.rknn' if is_quant else '.rknn')

    print('--> Building model')
    ret = rknn.build(do_quantization=is_quant, dataset='./dataset.txt')
    if ret != 0:
        print('rknn.build failed!')
        # SystemExit is raised directly: the exit() builtin is injected by
        # the site module and is not guaranteed to exist in every runtime.
        raise SystemExit(ret)
    print('build done')

    rm(rknn_file)
    ret = rknn.export_rknn(rknn_file)
    if ret != 0:
        print('rknn.export_rknn failed!')
        raise SystemExit(ret)

    print('--> Init runtime environment')
    if system == 'Windows':
        # On a Windows host the model runs on an attached RK1808 device.
        print("rknn run target rk1808")
        ret = rknn.init_runtime(target='rk1808', target_sub_class='AICS')
    else:
        print("rknn run target self")
        ret = rknn.init_runtime()
    if ret != 0:
        print('Init runtime environment failed')
        raise SystemExit(ret)
    print('done')

    print('input:')
    print(ix)

    # Reference output for the numeric comparison. The fp16 CUDA path is the
    # original reference (presumably chosen to be closer to the NPU's float
    # precision — TODO confirm); when no CUDA device is present the script
    # now falls back to an fp32 CPU reference instead of crashing.
    # Note that .cuda()/.half() convert pt_m itself in place.
    if torch.cuda.is_available():
        pt_m_fp16 = pt_m.cuda()
        pt_m_fp16.half()
        pt_m_fp16.eval()
        t0 = timer()
        pt_y = pt_m_fp16.forward(x.cuda().half())
        t1 = timer()
        print("torch out, cuda:fp16", pt_y.shape, "time:", t1 - t0)
    else:
        t0 = timer()
        pt_y = pt_m.forward(x)
        t1 = timer()
        print("torch out, cpu:fp32", pt_y.shape, "time:", t1 - t0)
    pt_y = pt_y.cpu().float().detach().numpy()
    print(pt_y)

    t0 = timer()
    # Optional extra argument that was tried: inputs_pass_through=[1]
    rknn_out = rknn.inference(inputs=[ix], data_type='float32', data_format='nchw')
    t1 = timer()

    if rknn_out is None:
        print('rknn inference failed')
        raise SystemExit(-100)
    y = np.array(rknn_out[0])
    print("rknn out:", ix.shape, ', ', y.shape, " time:", t1 - t0)
    print(y)

    if y.shape != pt_y.shape:
        # Differently shaped outputs cannot be subtracted element-wise
        # (NumPy raises a broadcasting error), so only report the mismatch.
        # A plain reshape such as
        #   y = y.reshape(pt_y.shape[0], pt_y.shape[1], -1)
        # would make the shapes agree but would not undo a transposed layout.
        print("******* shape err: expect ", pt_y.shape, " but rknn out ", y.shape, " reshape:")
    else:
        diff = np.abs(y - pt_y)
        cnt = y.size
        print("diff:")
        print(diff)
        # For each tolerance, list the offending differences and the share
        # of all elements they represent. True division keeps fractional
        # percentages (floor division rounded them down to whole numbers).
        for label in ("0.05", "0.001", "0.0001"):
            d = diff[diff > float(label)]
            print("warning values(diff > " + label + "):", d.shape,
                  " %:", 100.0 * d.size / cnt)
            print(d)

    rknn.release()

复制代码

只看该作者 · 发表于 2020-3-21 22:45:17

NPU部门回复：
目前pytorch只支持NCHW这种4维输入的卷积，这个模型的卷积输入是3维，暂不支持。