海思NNIE开发(三):FasterRCNN在海思NNIE平台上的执行流程(二)
系列文章
海思NNIE开发(一):海思Hi3559AV100/Hi3519AV100 NNIE深度学习模块开发与调试记录
海思NNIE开发(二):FasterRCNN在海思NNIE平台上的执行流程(一)
海思NNIE开发(三):FasterRCNN在海思NNIE平台上的执行流程(二)
海思NNIE开发(五):基于Hi3559AV100的FasterRCNN、RFCN、SSD、Yolov2、Yolov3性能综合测评
------------------------------------------------------------------------------------------------------------------------------------
正文
本篇文章我们接着上一篇(海思NNIE开发二),继续分析FasterRCNN在海思NNIE平台上的执行流程。
1. 解析网络模型信息
首先我们来看以下加载模型网络信息的函数:
s32Ret = SAMPLE_COMM_SVP_NNIE_LoadModel(pcModelName,&s_stFasterRcnnModel);
这里的pcModelName就是wk文件的路径,s_stFasterRcnnModel是SAMPLE_SVP_NNIE_MODEL_S结构体,我们在上一篇(海思NNIE开发二)文章中已对该结构体做分析。我们进入该函数分析:
/**
 * Read a model file (*.wk) into a freshly allocated buffer and parse it.
 *
 * Steps: open the file, measure its size with fseek/ftell, obtain a buffer of
 * that size from SAMPLE_COMM_SVP_MallocMem, read the whole file into it, then
 * hand the buffer to HI_MPI_SVP_NNIE_LoadModel.
 *
 * @param pszModelFile  path of the model file to open.
 * @param pstNnieModel  output; stModelBuf receives the buffer size/addresses,
 *                      stModel receives the parsed network description.
 * @return HI_SUCCESS when every step succeeded, HI_FAILURE otherwise. On
 *         failure the file is closed and, if the buffer was already
 *         allocated, it is released through SAMPLE_SVP_MMZ_FREE.
 */
HI_S32 SAMPLE_COMM_SVP_NNIE_LoadModel(HI_CHAR * pszModelFile,
    SAMPLE_SVP_NNIE_MODEL_S *pstNnieModel)
{
    HI_S32 s32Ret = HI_INVALID_VALUE;
    HI_U64 u64PhyAddr = 0;
    HI_U8 *pu8VirAddr = NULL;
    HI_SL slFileSize = 0;
    FILE *fp = NULL;

    /* Open the model file; every later step needs a valid stream. */
    fp = fopen(pszModelFile, "rb");
    SAMPLE_SVP_CHECK_EXPR_GOTO(NULL == fp, FAIL_0, SAMPLE_SVP_ERR_LEVEL_ERROR,
        "Error, open model file failed!\n");

    /* Seek to the end, read the position as the file size, then rewind. */
    s32Ret = fseek(fp, 0L, SEEK_END);
    SAMPLE_SVP_CHECK_EXPR_GOTO(0 != s32Ret, FAIL_0, SAMPLE_SVP_ERR_LEVEL_ERROR,
        "Error, fseek failed!\n");
    slFileSize = ftell(fp);
    SAMPLE_SVP_CHECK_EXPR_GOTO(slFileSize <= 0, FAIL_0, SAMPLE_SVP_ERR_LEVEL_ERROR,
        "Error, ftell failed!\n");
    s32Ret = fseek(fp, 0L, SEEK_SET);
    SAMPLE_SVP_CHECK_EXPR_GOTO(0 != s32Ret, FAIL_0, SAMPLE_SVP_ERR_LEVEL_ERROR,
        "Error, fseek failed!\n");

    /* Allocate a buffer (physical + virtual address) sized to the file. */
    s32Ret = SAMPLE_COMM_SVP_MallocMem("SAMPLE_NNIE_MODEL", NULL, (HI_U64*)&u64PhyAddr,
        (void**)&pu8VirAddr, slFileSize);
    SAMPLE_SVP_CHECK_EXPR_GOTO(HI_SUCCESS != s32Ret, FAIL_0, SAMPLE_SVP_ERR_LEVEL_ERROR,
        "Error,Malloc memory failed!\n");

    pstNnieModel->stModelBuf.u32Size = (HI_U32)slFileSize;
    pstNnieModel->stModelBuf.u64PhyAddr = u64PhyAddr;
    pstNnieModel->stModelBuf.u64VirAddr = (HI_U64)pu8VirAddr;

    /* Read the whole file as one item, so fread must return exactly 1. */
    s32Ret = (HI_S32)fread(pu8VirAddr, slFileSize, 1, fp);
    SAMPLE_SVP_CHECK_EXPR_GOTO(1 != s32Ret, FAIL_1, SAMPLE_SVP_ERR_LEVEL_ERROR,
        "Error,read model file failed!\n");

    /* Parse the in-memory model data into the network model structure. */
    s32Ret = HI_MPI_SVP_NNIE_LoadModel(&pstNnieModel->stModelBuf, &pstNnieModel->stModel);
    SAMPLE_SVP_CHECK_EXPR_GOTO(HI_SUCCESS != s32Ret, FAIL_1, SAMPLE_SVP_ERR_LEVEL_ERROR,
        "Error,HI_MPI_SVP_NNIE_LoadModel failed!\n");

    fclose(fp);
    return HI_SUCCESS;

FAIL_1:
    /* The buffer was allocated: release it and reset the recorded size. */
    SAMPLE_SVP_MMZ_FREE(pstNnieModel->stModelBuf.u64PhyAddr, pstNnieModel->stModelBuf.u64VirAddr);
    pstNnieModel->stModelBuf.u32Size = 0;
FAIL_0:
    if (NULL != fp)
    {
        fclose(fp);
    }
    return HI_FAILURE;
}
这个函数执行以下步骤:
获取wk文件字节大小
分配存储wk文件的内存空间
读取wk文件到内存空间
从wk文件的内存buf中解析出网络模型信息
执行完后,模型信息在s_stFasterRcnnModel.stModel结构体里,这个结构体里存储的是什么信息,可参考我上一篇文章(海思NNIE开发二),这里简单罗列各个段、输入输出节点的信息如下:
| 段 | 段类型/段类型值 | 输入/输出 | 节点名 | 节点类型/节点类型值 |
|---|---|---|---|---|
| 第1段 | SVP_NNIE_NET_TYPE_CNN/0 | 输入 | data | SVP_BLOB_TYPE_S32/0 |
| 第1段 | SVP_NNIE_NET_TYPE_CNN/0 | 输出 | conv5 | SVP_BLOB_TYPE_S32/0 |
| 第1段 | SVP_NNIE_NET_TYPE_CNN/0 | 输出 | rpn_cls_score | SVP_BLOB_TYPE_S32/0 |
| 第1段 | SVP_NNIE_NET_TYPE_CNN/0 | 输出 | rpn_bbox_pred | SVP_BLOB_TYPE_S32/0 |
| 第1段 | SVP_NNIE_NET_TYPE_CNN/0 | 输出 | rpn_cls_prob_reshape | SVP_BLOB_TYPE_S32/0 |
| 第2段 | SVP_NNIE_NET_TYPE_ROI/1 | 输入 | conv5 | SVP_BLOB_TYPE_S32/0 |
| 第2段 | SVP_NNIE_NET_TYPE_ROI/1 | 输出 | bbox_pred | SVP_BLOB_TYPE_VEC_S32/4 |
| 第2段 | SVP_NNIE_NET_TYPE_ROI/1 | 输出 | cls_prob | SVP_BLOB_TYPE_VEC_S32/4 |
2. 初始化
解析完网络模型信息之后,结构体指针给到 SAMPLE_SVP_NNIE_PARAM_S s_stFasterRcnnNnieParam这个结构体中,如下:
s_stFasterRcnnNnieParam.pstModel = &s_stFasterRcnnModel.stModel;
我们接着看以下初始化函数:
s32Ret = SAMPLE_SVP_NNIE_FasterRcnn_ParamInit(&stNnieCfg,&s_stFasterRcnnNnieParam,&s_stFasterRcnnSoftwareParam);
这个函数里面执行稍复杂,简单来说就是使用stNnieCfg等信息来初始化s_stFasterRcnnNnieParam,再使用s_stFasterRcnnNnieParam等来初始化s_stFasterRcnnSoftwareParam。该函数的实现如下:
/**
 * Initialise the FasterRCNN parameters in two stages: first the hardware
 * (NNIE) parameters, then the software parameters.
 *
 * @param pstFasterRcnnCfg  configuration passed to both init stages.
 * @param pstNnieParam      NNIE parameter block filled by the hardware stage
 *                          and read by the software stage.
 * @param pstSoftWareParam  software parameter block filled by the second stage.
 * @return HI_SUCCESS when both stages succeed; HI_FAILURE if either stage
 *         fails, after SAMPLE_SVP_NNIE_FasterRcnn_Deinit has been called to
 *         undo partial initialisation.
 */
static HI_S32 SAMPLE_SVP_NNIE_FasterRcnn_ParamInit(SAMPLE_SVP_NNIE_CFG_S* pstFasterRcnnCfg,
    SAMPLE_SVP_NNIE_PARAM_S *pstNnieParam,
    SAMPLE_SVP_NNIE_FASTERRCNN_SOFTWARE_PARAM_S* pstSoftWareParam)
{
    HI_S32 s32Ret = HI_SUCCESS;

    /* init hardware parameter */
    s32Ret = SAMPLE_COMM_SVP_NNIE_ParamInit(pstFasterRcnnCfg, pstNnieParam);
    if (HI_SUCCESS != s32Ret)
    {
        goto INIT_FAIL_0;
    }

    /* init software parameter */
    s32Ret = SAMPLE_SVP_NNIE_FasterRcnn_SoftwareInit(pstFasterRcnnCfg, pstNnieParam, pstSoftWareParam);
    if (HI_SUCCESS != s32Ret)
    {
        goto INIT_FAIL_0;
    }

    return s32Ret;

INIT_FAIL_0:
    /* Best-effort cleanup; the function reports HI_FAILURE regardless of the Deinit result. */
    (void)SAMPLE_SVP_NNIE_FasterRcnn_Deinit(pstNnieParam, pstSoftWareParam, NULL);
    return HI_FAILURE;
}
分为SAMPLE_COMM_SVP_NNIE_ParamInit与SAMPLE_SVP_NNIE_FasterRcnn_SoftwareInit两个函数。我们首先看SAMPLE_COMM_SVP_NNIE_ParamInit,这个函数的实现里做了一些输入参数的有效性判断后,就直接调用SAMPLE_SVP_NNIE_ParamInit,因此我们就直接看SAMPLE_SVP_NNIE_ParamInit的实现,在这个函数里首先调用:
s32Ret = SAMPLE_SVP_NNIE_FillForwardInfo(pstNnieCfg,pstNnieParam);
这个函数的实质就是使用pstNnieParam->pstModel->astSeg的信息来初始化pstNnieParam->astForwardCtrl(ROI类型的段则为pstNnieParam->astForwardWithBboxCtrl)与pstNnieParam->astSegData这几个结构体,其实现如下:
/*
 * Populate the forward-control entries and the per-segment source/destination
 * blob descriptors of pstNnieParam, using the parsed model
 * (pstNnieParam->pstModel) and the user configuration (pstNnieCfg).
 *
 * pstNnieCfg    configuration: core ids, max input/ROI counts, step addresses.
 * pstNnieParam  parameter block whose astForwardCtrl / astForwardWithBboxCtrl /
 *               astSegData entries are written.
 *
 * Always returns HI_SUCCESS.
 */
static HI_S32 SAMPLE_SVP_NNIE_FillForwardInfo(SAMPLE_SVP_NNIE_CFG_S *pstNnieCfg,
    SAMPLE_SVP_NNIE_PARAM_S *pstNnieParam)
{
    HI_U32 u32Seg = 0;
    HI_U32 u32Node = 0;
    HI_U32 u32TaskOffset = 0;   /* running offset of this segment inside stTaskBuf */
    HI_U32 u32DstBlobNum = 0;   /* u32Num written to every destination blob of the segment */

    for (u32Seg = 0; u32Seg < pstNnieParam->pstModel->u32NetSegNum; u32Seg++)
    {
        /* Forward-control entry: ROI segments use the "with bbox" control array. */
        switch (pstNnieParam->pstModel->astSeg[u32Seg].enNetType)
        {
        case SVP_NNIE_NET_TYPE_ROI:
            pstNnieParam->astForwardWithBboxCtrl[u32Seg].enNnieId =
                pstNnieCfg->aenNnieCoreId[u32Seg];
            pstNnieParam->astForwardWithBboxCtrl[u32Seg].u32SrcNum =
                pstNnieParam->pstModel->astSeg[u32Seg].u16SrcNum;
            pstNnieParam->astForwardWithBboxCtrl[u32Seg].u32DstNum =
                pstNnieParam->pstModel->astSeg[u32Seg].u16DstNum;
            pstNnieParam->astForwardWithBboxCtrl[u32Seg].u32ProposalNum = 1;
            pstNnieParam->astForwardWithBboxCtrl[u32Seg].u32NetSegId = u32Seg;
            pstNnieParam->astForwardWithBboxCtrl[u32Seg].stTmpBuf = pstNnieParam->stTmpBuf;
            pstNnieParam->astForwardWithBboxCtrl[u32Seg].stTskBuf.u64PhyAddr =
                pstNnieParam->stTaskBuf.u64PhyAddr + u32TaskOffset;
            pstNnieParam->astForwardWithBboxCtrl[u32Seg].stTskBuf.u64VirAddr =
                pstNnieParam->stTaskBuf.u64VirAddr + u32TaskOffset;
            pstNnieParam->astForwardWithBboxCtrl[u32Seg].stTskBuf.u32Size =
                pstNnieParam->au32TaskBufSize[u32Seg];
            break;

        case SVP_NNIE_NET_TYPE_CNN:
        case SVP_NNIE_NET_TYPE_RECURRENT:
            pstNnieParam->astForwardCtrl[u32Seg].enNnieId =
                pstNnieCfg->aenNnieCoreId[u32Seg];
            pstNnieParam->astForwardCtrl[u32Seg].u32SrcNum =
                pstNnieParam->pstModel->astSeg[u32Seg].u16SrcNum;
            pstNnieParam->astForwardCtrl[u32Seg].u32DstNum =
                pstNnieParam->pstModel->astSeg[u32Seg].u16DstNum;
            pstNnieParam->astForwardCtrl[u32Seg].u32NetSegId = u32Seg;
            pstNnieParam->astForwardCtrl[u32Seg].stTmpBuf = pstNnieParam->stTmpBuf;
            pstNnieParam->astForwardCtrl[u32Seg].stTskBuf.u64PhyAddr =
                pstNnieParam->stTaskBuf.u64PhyAddr + u32TaskOffset;
            pstNnieParam->astForwardCtrl[u32Seg].stTskBuf.u64VirAddr =
                pstNnieParam->stTaskBuf.u64VirAddr + u32TaskOffset;
            pstNnieParam->astForwardCtrl[u32Seg].stTskBuf.u32Size =
                pstNnieParam->au32TaskBufSize[u32Seg];
            break;

        default:
            /* Other segment types get no forward-control entry. */
            break;
        }

        /* The next segment's task buffer starts right after this one. */
        u32TaskOffset += pstNnieParam->au32TaskBufSize[u32Seg];

        /* Source blobs: copy type and shape from the model's source nodes. */
        for (u32Node = 0; u32Node < pstNnieParam->pstModel->astSeg[u32Seg].u16SrcNum; u32Node++)
        {
            /* Type and blob count are set the same way for both shape layouts. */
            pstNnieParam->astSegData[u32Seg].astSrc[u32Node].enType =
                pstNnieParam->pstModel->astSeg[u32Seg].astSrcNode[u32Node].enType;
            pstNnieParam->astSegData[u32Seg].astSrc[u32Node].u32Num =
                pstNnieCfg->u32MaxInputNum;

            if (SVP_BLOB_TYPE_SEQ_S32 !=
                pstNnieParam->pstModel->astSeg[u32Seg].astSrcNode[u32Node].enType)
            {
                /* Non-sequence blob: channel / height / width shape. */
                pstNnieParam->astSegData[u32Seg].astSrc[u32Node].unShape.stWhc.u32Chn =
                    pstNnieParam->pstModel->astSeg[u32Seg].astSrcNode[u32Node].unShape.stWhc.u32Chn;
                pstNnieParam->astSegData[u32Seg].astSrc[u32Node].unShape.stWhc.u32Height =
                    pstNnieParam->pstModel->astSeg[u32Seg].astSrcNode[u32Node].unShape.stWhc.u32Height;
                pstNnieParam->astSegData[u32Seg].astSrc[u32Node].unShape.stWhc.u32Width =
                    pstNnieParam->pstModel->astSeg[u32Seg].astSrcNode[u32Node].unShape.stWhc.u32Width;
                continue;
            }

            /* Sequence blob: dimension plus the step address taken from the config. */
            pstNnieParam->astSegData[u32Seg].astSrc[u32Node].unShape.stSeq.u32Dim =
                pstNnieParam->pstModel->astSeg[u32Seg].astSrcNode[u32Node].unShape.u32Dim;
            pstNnieParam->astSegData[u32Seg].astSrc[u32Node].unShape.stSeq.u64VirAddrStep =
                pstNnieCfg->au64StepVirAddr[u32Seg * SAMPLE_SVP_NNIE_EACH_SEG_STEP_ADDR_NUM];
        }

        /* ROI segments emit one result per ROI per input; others one per input. */
        u32DstBlobNum =
            (SVP_NNIE_NET_TYPE_ROI == pstNnieParam->pstModel->astSeg[u32Seg].enNetType)
                ? pstNnieCfg->u32MaxRoiNum * pstNnieCfg->u32MaxInputNum
                : pstNnieCfg->u32MaxInputNum;

        /* Destination blobs: same scheme as the sources, with u32DstBlobNum as count. */
        for (u32Node = 0; u32Node < pstNnieParam->pstModel->astSeg[u32Seg].u16DstNum; u32Node++)
        {
            pstNnieParam->astSegData[u32Seg].astDst[u32Node].enType =
                pstNnieParam->pstModel->astSeg[u32Seg].astDstNode[u32Node].enType;
            pstNnieParam->astSegData[u32Seg].astDst[u32Node].u32Num = u32DstBlobNum;

            if (SVP_BLOB_TYPE_SEQ_S32 !=
                pstNnieParam->pstModel->astSeg[u32Seg].astDstNode[u32Node].enType)
            {
                pstNnieParam->astSegData[u32Seg].astDst[u32Node].unShape.stWhc.u32Chn =
                    pstNnieParam->pstModel->astSeg[u32Seg].astDstNode[u32Node].unShape.stWhc.u32Chn;
                pstNnieParam->astSegData[u32Seg].astDst[u32Node].unShape.stWhc.u32Height =
                    pstNnieParam->pstModel->astSeg[u32Seg].astDstNode[u32Node].unShape.stWhc.u32Height;
                pstNnieParam->astSegData[u32Seg].astDst[u32Node].unShape.stWhc.u32Width =
                    pstNnieParam->pstModel->astSeg[u32Seg].astDstNode[u32Node].unShape.stWhc.u32Width;
                continue;
            }

            pstNnieParam->astSegData[u32Seg].astDst[u32Node].unShape.stSeq.u32Dim =
                pstNnieParam->pstModel->astSeg[u32Seg].astDstNode[u32Node].unShape.u32Dim;
            /* Destination step address sits one slot after the source one. */
            pstNnieParam->astSegData[u32Seg].astDst[u32Node].unShape.stSeq.u64VirAddrStep =
                pstNnieCfg->au64StepVirAddr[u32Seg * SAMPLE_SVP_NNIE_EACH_SEG_STEP_ADDR_NUM + 1];
        }
    }

    return HI_SUCCESS;
}
这个函数还对pstNnieParam->astForwardWithBboxCtrl[i].stTskBuf、pstNnieParam->astForwardCtrl[i].stTskBuf、pstNnieParam->astForwardWithBboxCtrl[i].stTmpBuf、pstNnieParam->astForwardCtrl[i].stTmpBuf等参数进行初始化,其实这里没有必要,因为pstNnieParam->stTaskBuf、pstNnieParam->stTmpBuf等结构体的值也是空的,并没有做过申请内存的操作。
我们再来看第2个关键函数:
/*1. 计算网络各段的辅助内存大小2. 计算第1段第1个输入节点的Blob的辅助内存大小3. 计算各段第1个输出节点的Blob的辅助内存大小*/s32Ret = SAMPLE_SVP_NNIE_GetTaskAndBlobBufSize(pstNnieCfg,pstNnieParam,&u32TotalTaskBufSize, /*输入&输出:输入值为0; 输出:网络各段辅助内存的总和*/&u32TmpBufSize, /*输入&输出,输入值为0; 输出:模型辅助内存大小*/astBlobSize, /*输入&输出:输入为空; 输出:各段第1个输入、输出节点辅助内存*/&u32TotalSize /*输入&输出:输入值为0; 输出为:段辅助内存+模型辅助内存+第1段第1个输入节点辅助内存+各段第1个输出节点辅助内存大小*/);
这个函数是计算各个段、各个段中的各个节点的辅助内存大小。我们知道,在之前的load模型的步骤中,是已经获取到模型的辅助内存(pstNnieParam->pstModel->u32TmpBufSize),但各段、段中各个节点的辅助内存是不知道的,因此该函数就是获取这些辅助内存。该函数的实现如下:
static HI_S32 SAMPLE_SVP_NNIE_GetTaskAndBlobBufSize(SAMPLE_SVP_NNIE_CFG_S *pstNnieCfg,SAMPLE_SVP_NNIE_PARAM_S *pstNnieParam,HI_U32*pu32TotalTaskBufSize,/*输入&输出:输入值为0,输出:网络各段辅助内存的总和*/HI_U32*pu32TmpBufSize,/*输入&输出, 输入值为0, 输出:模型辅助内存大小*/SAMPLE_SVP_NNIE_BLOB_SIZE_S astBlobSize[], /*输入&输出:输入为空;输出:各段第1个输入、输出节点辅助内存*/HI_U32*pu32TotalSize/*输入&输出:输入值为0, 输出为:段辅助内存+模型辅助内存+第1段第1个输入节点辅助内存+各段第1个输出节点辅助内存大小*/){HI_S32 s32Ret = HI_SUCCESS;HI_U32 i = 0, j = 0;HI_U32 u32TotalStep = 0;/*Get each seg's task buf size*//*获取给定网络任务各段辅助内存大小*/s32Ret = HI_MPI_SVP_NNIE_GetTskBufSize(pstNnieCfg->u32MaxInputNum/*图片数量:1*/,pstNnieCfg->u32MaxRoiNum,// 输入,300pstNnieParam->pstModel,// 输入pstNnieParam->au32TaskBufSize,// 输出:网络任务各段辅助内存pstNnieParam->pstModel->u32NetSegNum);// 输入:网络任务的段数SAMPLE_SVP_CHECK_EXPR_RET(HI_SUCCESS != s32Ret,s32Ret,SAMPLE_SVP_ERR_LEVEL_ERROR,"Error,HI_MPI_SVP_NNIE_GetTaskSize failed!\n");/*Get total task buf size*/*pu32TotalTaskBufSize = 0;for(i = 0; i < pstNnieParam->pstModel->u32NetSegNum; i++){*pu32TotalTaskBufSize += pstNnieParam->au32TaskBufSize[i]; /*累加网络任务各段辅助内存*/}/*Get tmp buf size*/*pu32TmpBufSize = pstNnieParam->pstModel->u32TmpBufSize; // 模型辅助内存大小*pu32TotalSize += *pu32TotalTaskBufSize + *pu32TmpBufSize;// 段辅助内存+模型辅助内存/*calculate Blob mem size*/for(i = 0; i < pstNnieParam->pstModel->u32NetSegNum; i++){if(SVP_NNIE_NET_TYPE_RECURRENT == pstNnieParam->pstModel->astSeg[i].enNetType){for(j = 0; j < pstNnieParam->astSegData[i].astSrc[0].u32Num; j++){u32TotalStep += *((HI_S32*)pstNnieParam->astSegData[i].astSrc[0].unShape.stSeq.u64VirAddrStep+j);}}/*the first seg's Src Blob mem size, other seg's src blobs from the output blobs ofthose segs before it or from software output results*/if(i == 0){/*计算第1段第1个输入节点的Blob的辅助内存大小*/SAMPLE_SVP_NNIE_GetBlobMemSize(&(pstNnieParam->pstModel->astSeg[i].astSrcNode[0]), /*输入,第i段的第1个输入节点信息*/pstNnieParam->pstModel->astSeg[i].u16SrcNum, 
/*输入,这里是1*/u32TotalStep,/*输入,这里是0*/&(pstNnieParam->astSegData[i].astSrc[0]),/*第i段的第1个节点信息,在SAMPLE_SVP_NNIE_FillForwardInfo中已填充部分该结构体部分信息*/SAMPLE_SVP_NNIE_ALIGN_16, /*输入:内存对齐方式*/pu32TotalSize,/*输入&输出:输入为:段辅助内存+模型辅助内存;输出为:段辅助内存+模型辅助内存+输入节点辅助内存*/&(astBlobSize[i].au32SrcSize[0])/*输入&输出:输入为空;输出为:各个节点的Blob的辅助内存大小*/));}/*Get each seg's Dst Blob mem size*//*计算第1个输出节点的Blob的辅助内存大小*/SAMPLE_SVP_NNIE_GetBlobMemSize(&(pstNnieParam->pstModel->astSeg[i].astDstNode[0]),pstNnieParam->pstModel->astSeg[i].u16DstNum,u32TotalStep,&(pstNnieParam->astSegData[i].astDst[0]),SAMPLE_SVP_NNIE_ALIGN_16, pu32TotalSize, &(astBlobSize[i].au32DstSize[0]));}return s32Ret;}
在这个函数中,首先调用底层API HI_MPI_SVP_NNIE_GetTskBufSize获取到网络任务的各段的辅助内存pstNnieParam->au32TaskBufSize,然后再调用SAMPLE_SVP_NNIE_GetBlobMemSize计算第1段的第1个输入节点Blob的辅助内存,以及每段的第1个输出节点的Blob辅助内存。
回到SAMPLE_SVP_NNIE_ParamInit函数中,SAMPLE_SVP_NNIE_GetTaskAndBlobBufSize执行完后,u32TotalSize为总的辅助内存大小(含模型、段、节点),此时调用:
s32Ret = SAMPLE_COMM_SVP_MallocCached("SAMPLE_NNIE_TASK",NULL,(HI_U64*)&u64PhyAddr,(void**)&pu8VirAddr,u32TotalSize);
分配内存空间。接着后面,再根据得到的虚拟内存地址、物理内存地址来初始化pstNnieParam->stTaskBuf、pstNnieParam->stTmpBuf、pstNnieParam->astForwardWithBboxCtrl[i].stTmpBuf、pstNnieParam->astForwardWithBboxCtrl[i].stTskBuf、pstNnieParam->astForwardCtrl[i].stTskBuf、pstNnieParam->astSegData[i].astSrc[j]这些结构体中的内存地址值,这个才是真正的初始化,之前在SAMPLE_SVP_NNIE_FillForwardInfo函数中也有对这些结构体做初始化,但那是“假初始化”。
至此,SAMPLE_SVP_NNIE_ParamInit函数执行完毕。