OpenVINO CPU Acceleration Investigation
Theory
1. Introduction
OpenVINO™ is an open-source toolkit for optimizing and deploying AI inference.
- Boosts deep-learning performance for computer vision, automatic speech recognition, natural language processing, and other common tasks
- Works with models trained in popular frameworks such as TensorFlow and PyTorch
- Reduces resource demands and deploys efficiently across a range of Intel® platforms, from edge to cloud
The workflow: train, optimize, deploy.
2. Optimization Principles
- Linear Operations Fusing (operator fusion): adjacent linear operations are merged into a single node, e.g. folding BatchNorm into the preceding convolution (see the sketch after this list)
- Precision Calibration: in practice this means INT8 model quantization; Intel's NNCF can also be used for other model-compression techniques
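As a concrete illustration of linear-operation fusing, here is a minimal NumPy sketch of the classic convolution + BatchNorm folding. It illustrates the math only, not OpenVINO's actual implementation: the BN statistics are absorbed into the convolution's weights and bias, so the fused graph executes one node instead of two.

import numpy as np

def fold_bn_into_conv(w, b, gamma, beta, mean, var, eps=1e-5):
    """Fold BatchNorm parameters into conv weights w [out_ch, in_ch, kh, kw]
    and bias b [out_ch], so that conv(x, w_f, b_f) == bn(conv(x, w, b))."""
    scale = gamma / np.sqrt(var + eps)      # per-output-channel scale factor
    w_f = w * scale.reshape(-1, 1, 1, 1)    # scale each output filter
    b_f = (b - mean) * scale + beta         # absorb the BN shift into the bias
    return w_f, b_f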
3. Common OpenVINO Tools
- Deep Learning Model Optimizer - a cross-platform command-line tool that imports models and prepares them for optimal execution with the Inference Engine. The Model Optimizer imports, converts, and optimizes models trained in popular frameworks such as Caffe, TensorFlow, MXNet, Kaldi, and ONNX*.
- Deep Learning Inference Engine - a unified API for high-performance inference on many hardware types, including Intel CPUs, Intel integrated graphics, Intel Neural Compute Stick 2, and Intel Vision Accelerator Design with Intel Movidius Vision Processing Units (VPUs).
- Inference Engine Samples - a set of simple console applications demonstrating how to use the Inference Engine in your own applications.
- Deep Learning Workbench - a web-based graphical environment that makes the more complex OpenVINO toolkit components easy to use.
- Post-Training Optimization Tool - a tool that calibrates a model so it can then be executed in INT8 precision.
- Additional tools - a set of tools for working with models, including the Benchmark Tool (https://docs.openvinotoolkit.org/latest/openvino_inference_engine_tools_benchmark_tool_README.html), Cross Check Tool, and Compile Tool.
- Open Model Zoo
  - Demos - console applications that provide robust application templates to help you implement specific deep learning scenarios.
  - Other tools - a set of tools for working with models, including the Accuracy Checker Utility and Model Downloader.
  - Documentation for Pretrained Models - documentation for the pretrained models available in the Open Model Zoo repository.
Hands-On Practice
1. Environment Setup
# Pull and start the container
docker pull openvino/ubuntu18_dev:latest
docker run -itd -p 8501:8501 -p 8500:8500 -p 8889:8889 -v "/root/openvino_notebooks:/openvino_notebooks" openvino/ubuntu18_dev:latest
# Enter the container (replace bc89fe5f98e6 with your container ID from `docker ps`)
docker exec -it -u root bc89fe5f98e6 /bin/bash
# Clone the examples repository
git clone --depth=1 https://github.com/openvinotoolkit/openvino_notebooks.git
# Install Jupyter and dependencies
cd openvino_notebooks
sudo apt-get update
sudo apt-get upgrade
sudo apt-get install python3-venv build-essential python3-dev git-all
python -m pip install --upgrade pip
pip install -r requirements.txt
python -m ipykernel install --user --name openvino_env
apt-get install vim
# Start JupyterLab
jupyter lab notebooks --allow-root
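Before moving on, it helps to confirm that the OpenVINO runtime inside the container can see the CPU device. A quick check (assuming a 2022.x runtime that exposes openvino.runtime; older releases expose openvino.inference_engine.IECore instead):

# Run inside the container's Python to verify the runtime sees the CPU.
from openvino.runtime import Core

print(Core().available_devices)  # expect at least ['CPU']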
2. Model Conversion (in a Jupyter notebook)
import time
from pathlib import Path
from IPython.display import Markdown, display
# Construct the command for Model Optimizer
mo_command = f"""mo
--saved_model_dir "/openvino_notebooks/open_model_zoo_models/custom/origin_model"
--data_type FP32
--input dense_input,sparse_ids_input,sparse_wgt_input,seq_50_input
--input_shape [100,587],[100,53],[100,53],[100,6,50]
--output_dir "/openvino_notebooks/open_model_zoo_models/custom/fp32"
--output "Identity"
"""
mo_command = " ".join(mo_command.split())
print("Model Optimizer command to convert TensorFlow to OpenVINO:")
display(Markdown(f"`{mo_command}`"))
! $mo_command
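To verify the conversion, the generated IR can be loaded back and its inputs and outputs inspected (a sketch assuming the openvino.runtime API that ships alongside the POT used below):

# Load the converted IR and print its input/output names and shapes.
from openvino.runtime import Core

core = Core()
model = core.read_model("/openvino_notebooks/open_model_zoo_models/custom/fp32/saved_model.xml")
for inp in model.inputs:
    print("input:", inp.any_name, inp.shape)
for out in model.outputs:
    print("output:", out.any_name, out.shape)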
3. Model Quantization (in a Jupyter notebook)
import os
from pathlib import Path
from openvino.tools.pot import DataLoader
import tensorflow as tf
import math
from yaspin import yaspin

# Read data from a TFRecord file
def input_fn_tfrecord(filenames, batch_size=256):
    """Build an input dataset from a TFRecord file."""
    reader = tf.data.TFRecordDataset(
        filenames,
        num_parallel_reads=10,
    ).shuffle(100000, reshuffle_each_iteration=True)
    features = {
        'dense_input': tf.io.FixedLenSequenceFeature([], tf.float32, allow_missing=True),
        'sparse_ids_input': tf.io.FixedLenSequenceFeature([], tf.int64, allow_missing=True),
        'sparse_wgt_input': tf.io.FixedLenSequenceFeature([], tf.float32, allow_missing=True),
        'seq_50_input': tf.io.FixedLenSequenceFeature([], tf.int64, allow_missing=True),
        'is_click': tf.io.FixedLenSequenceFeature([], tf.int64, allow_missing=True),
    }

    def _parse_example(example):
        """Parse one serialized example into the four model inputs plus the label."""
        parse_data = tf.io.parse_single_example(example, features)
        return [
            tf.reshape(parse_data['dense_input'][:587], shape=[587]),
            tf.reshape(tf.cast(parse_data["sparse_ids_input"], tf.int32), shape=[53]),
            tf.reshape(parse_data["sparse_wgt_input"], shape=[53]),
            tf.reshape(tf.reshape(tf.cast(parse_data['seq_50_input'], tf.int32), [-1, 50])[:6, :], shape=[6, 50]),
            tf.reshape(parse_data['is_click'], shape=[1])]

    dataset = reader.map(_parse_example, num_parallel_calls=11)  # parse the records
    dataset = dataset.prefetch(buffer_size=batch_size)
    batch = dataset.batch(batch_size=batch_size)
    return batch
# Data preprocessing
data_file = "/openvino_notebooks/open_model_zoo_models/custom/eval_processed_data.tfrecords"
batch_size = 100
inputs_list = ['dense_input', 'sparse_ids_input', 'sparse_wgt_input', 'seq_50_input']
total_samples = sum(1 for _ in tf.compat.v1.python_io.tf_record_iterator(data_file))
n = math.ceil(float(total_samples) / batch_size)
data = []
with tf.compat.v1.Session() as sess:
    dataset = input_fn_tfrecord(data_file, batch_size)
    dataset_iterator = tf.compat.v1.data.make_one_shot_iterator(dataset)
    next_element = dataset_iterator.get_next()
    for i in range(n):
        # Fetch a fresh batch on every iteration (running sess.run once outside
        # the loop would append the same batch n times).
        batch = sess.run(next_element)
        records = {
            'dense_input': batch[0],
            'sparse_ids_input': batch[1],
            'sparse_wgt_input': batch[2],
            'seq_50_input': batch[3],
            'label': batch[4],
        }
        data.append(records)
class OriginModelDataLoader(DataLoader):
    def __init__(self, data_list):
        """POT data loader over pre-batched records.

        Args:
            data_list (list): list of dicts holding the model inputs and label
        """
        self.data_list = data_list

    def __getitem__(self, index):
        if index >= len(self.data_list):
            raise IndexError("Index out of dataset size")
        current_item = self.data_list[index]
        label = current_item['label']
        feat_names = {'dense_input', 'sparse_ids_input', 'sparse_wgt_input', 'seq_50_input'}
        inputs = {key: value for key, value in current_item.items() if key in feat_names}
        # POT expects (annotation, input_data); the annotation is (index, label).
        return ((index, label), inputs)

    def __len__(self):
        return len(self.data_list)
# Run model quantization
import addict
from openvino.tools.pot import IEEngine
from openvino.tools.pot import load_model, save_model
from openvino.tools.pot import compress_model_weights
from openvino.tools.pot import create_pipeline
from openvino.tools.pot import Metric  # replaces the deprecated `compression.api` import path
path_to_xml = "/openvino_notebooks/open_model_zoo_models/custom/fp32/saved_model.xml"
path_to_bin = "/openvino_notebooks/open_model_zoo_models/custom/fp32/saved_model.bin"
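# The pipeline below instantiates an Accuracy metric that the original
# write-up does not define. The following is a minimal sketch of the POT
# Metric interface, assuming the model emits a click probability that we
# threshold at 0.5 and compare against the is_click label.
import numpy as np

class Accuracy(Metric):
    def __init__(self):
        super().__init__()
        self.name = "accuracy"
        self._matches = []

    @property
    def value(self):
        """Accuracy for the most recent batch."""
        return {self.name: self._matches[-1]}

    @property
    def avg_value(self):
        """Average accuracy over all processed batches."""
        return {self.name: float(np.mean(self._matches))}

    def update(self, output, target):
        # output[0]: predicted click probabilities; target: is_click labels.
        predictions = (np.ravel(output[0]) > 0.5).astype(np.int64)
        labels = np.ravel(np.array(target)).astype(np.int64)
        self._matches.append(float((predictions == labels).mean()))

    def reset(self):
        self._matches = []

    def get_attributes(self):
        return {self.name: {"direction": "higher-better", "type": "accuracy"}}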
# Model config specifies the model name and the paths to the model .xml and .bin files
model_config = addict.Dict(
    {
        "model_name": "origin_model",
        "model": path_to_xml,
        "weights": path_to_bin,
    }
)
# Engine config
engine_config = addict.Dict({"device": "CPU"})
algorithms = [
    {
        "name": "AccuracyAwareQuantization",
        "params": {
            "target_device": "CPU",
            "stat_subset_size": 300,
            "maximal_drop": 0.001,  # cap the accuracy drop at 0.001
        },
    }
]
# Step 1: implement and create user's data loader
data_loader = OriginModelDataLoader(data)
# Step 2: load model
ir_model = load_model(model_config=model_config)
metric = Accuracy()
# Step 3: Initialize the engine for metric calculation and statistics collection.
engine = IEEngine(config=engine_config, data_loader=data_loader, metric=metric)
# Step 4: Create a pipeline of compression algorithms and run it.
pipeline = create_pipeline(algorithms, engine)
algorithm_name = pipeline.algo_seq[0].name
with yaspin(
    text=f"Executing POT pipeline on {model_config['model']} with {algorithm_name}"
) as sp:
    start_time = time.perf_counter()
    compressed_model = pipeline.run(ir_model)
    end_time = time.perf_counter()
    sp.ok("✔")
print(f"Quantization finished in {end_time - start_time:.2f} seconds")
# Step 5 (Optional): Compress model weights to quantized precision
# in order to reduce the size of the final .bin file.
compress_model_weights(compressed_model)
# Step 6: Save the compressed model to the desired path.
# Set save_path to the directory where the model should be saved
compressed_model_paths = save_model(
    model=compressed_model,
    save_path="optimized_model",
    model_name="optimized_model",
)
# Step 7 (Optional): Evaluate the original and compressed models and print the results.
original_metric_results = pipeline.evaluate(ir_model)
if original_metric_results:
    print(f"Accuracy of the original model: {next(iter(original_metric_results.values())):.5f}")
quantized_metric_results = pipeline.evaluate(compressed_model)
if quantized_metric_results:
    print(f"Accuracy of the quantized model: {next(iter(quantized_metric_results.values())):.5f}")
Comparison Tests Before and After Optimization
# Compare model sizes before and after optimization
ir_path = "/openvino_notebooks/open_model_zoo_models/custom/fp32/saved_model.xml"
quantized_model_path = "/openvino_notebooks/notebooks/002-openvino-api/optimized_model/optimized_model.xml"
original_model_size = Path(ir_path).with_suffix(".bin").stat().st_size / 1024
quantized_model_size = Path(quantized_model_path).with_suffix(".bin").stat().st_size / 1024
# Compute the reduction from the measured sizes instead of hardcoding them
# (measured here: 34231.60 KB -> 12384.25 KB)
compression_ratio = (original_model_size - quantized_model_size) / original_model_size * 100
print(f"FP32 model size: {original_model_size:.2f} KB")
print(f"INT8 model size: {quantized_model_size:.2f} KB")
print(f"Compression ratio: {compression_ratio:.4f}%")
# Performance comparison: benchmark_app is OpenVINO's official benchmarking tool
#!benchmark_app --help
model_name = "quantized_model"
benchmark_command = f"benchmark_app -m {quantized_model_path} -t 15 -d CPU -api async -hint latency"
display(Markdown(f"Benchmark command: `{benchmark_command}`"))
display(Markdown(f"Benchmarking {model_name} on CPU with async inference for 15 seconds..."))
! $benchmark_command
#!benchmark_app --help
model_path = "/openvino_notebooks/open_model_zoo_models/custom/fp32/saved_model.xml"
model_name = "origin_model"
# Same flags as above so the FP32 and INT8 runs are directly comparable
benchmark_command = f"benchmark_app -m {model_path} -t 15 -d CPU -api async -hint latency"
display(Markdown(f"Benchmark command: `{benchmark_command}`"))
display(Markdown(f"Benchmarking {model_name} on CPU with async inference for 15 seconds..."))
! $benchmark_command
4. Experimental Conclusions

| Model | Size | QPS |
|---|---|---|
| origin_model | 34231.60 KB | 88.93 |
| quantized_model | 12384.25 KB | 105.58 |
| Improvement | 63.82% reduction | 18.72% increase |
From the logs produced during conversion, we found that because the model structure is simple and compact and its features are very sparse, relatively few nodes were eligible for operator fusion and quantization, so the performance gain is not especially pronounced.