SlideShare a Scribd company logo
1 of 42
Download to read offline
Step 1: TF모델을 TRT 포맷으로 변환
Step 2: 모델 Parser 생성
Step 3: 입/출력 레이어 정보 입력
Step 4: 모델의 최적화 및
런타임 Engine 생성
Step 5: 엔진을 파일로 저장
Step 6: 엔진을 파일에서 읽음
Step 7: Inference 수행
•
•
•
•
•
•
•
•
•
PReLUPlugin::PReLUPlugin(const Weights *weights, int nbWeights) {
mWeights = weights[0];
mWeights.values = malloc(mWeights.count * type2size(mWeights.type));
memcpy(const_cast<void *>(mWeights.values), weights[0].values, mWeights.count * type2size(mWeights.type));
}
int PReLUPlugin::enqueue(int batchSize, const void *const *inputs, void **outputs, void *workspace,
cudaStream_t stream) {
const float zerof{0.0f}; const __half zeroh = fp16::__float2half(0.0f);
if (mWeights.type == DataType::__float) {
CHECK(Forward_gpu<__float>(batchSize * mNbInputCount, mNbInputChannels,
mNbInputHeight * mNbInputHeight, reinterpret_cast<const __float *>(mDeviceKernel),
reinterpret_cast<const __float *>(inputs[0]), reinterpret_cast<__float *>(outputs[0]),
zerof, mChannelShared ? mNbInputChannels : 1, stream));
} else { // DataType::kFLOAT }
return 0;
}
template <typename Ftype>
__global__ void PReLUForward(const int n, const int channels, const int dim, const Ftype* slope_data, const
Ftype* in, Ftype* out, const Ftype zero, const int div_factor) {
CUDA_KERNEL_LOOP(index, n) {
int c = (index / dim) % channels / div_factor;
out[index] = (in[index] > (Ftype(zero))) ? in[index] :
in[index] * *(reinterpret_cast<const Ftype*>(slope_data)+c);
}
}
template <typename Ftype>
cudaError_t Forward_gpu(const int count, const int channels, const int dim, const Ftype* mDeviceKernel,
const Ftype* bottom_data, Ftype* top_data, const Ftype zero, const int div_factor, const cudaStream_t stream) {
PReLUForward<<<CAFFE_GET_BLOCKS(count), CAFFE_CUDA_NUM_THREADS, 0, stream>>>
(count, channels, dim, mDeviceKernel, bottom_data, top_data, zero, div_factor);
return cudaGetLastError();
}
IPluginExt *PReLUPlugin::clone() const override {
return new PReLUPlugin(&mWeights, 1);
}
IPlugin* pluginFactory::createPlugin(const char* layerName, const Weights* serialData, int nbWeights) override {
return new PReLUPlugin(serialData, serialLength);
}
PluginFactory parserPluginFactory;
parser->setPluginFactoryExt(&parserPluginFactory);
const IBlobNameToTensor *blobNameToTensor =
parser->parse(gParams.deployFile.c_str(), // caffe deploy file
gParams.modelFile.c_str(), // caffe model file
*network, // network definition that the parser will populate
gParams.fp16 ? DataType::kHALF : DataType::kFLOAT);
builder->setMaxBatchSize(gParams.batchSize);
builder->setMaxWorkspaceSize(size_t(gParams.workspaceSize) << 20);
builder->setFp16Mode(gParams.fp16);
ICudaEngine* engine = builder->buildCudaEngine(*network);
void PReLUPlugin::serialize(void *buffer) {
char *d = static_cast<char *>(buffer), *a = d;
write(d, mNbInputChannels); write(d, mNbInputHeight); write(d, mNbInputWidth); write(d, mNbInputCount);
write(d, mChannelShared); write(d, mWeights.count); write(d, mWeights.type);
convertAndCopyToBuffer(d, mWeights);
assert(d == a + getSerializationSize());
}
PReLUPlugin::PReLUPlugin(const void *data, size_t length) {
const char *d = static_cast<const char *>(data), *a = d;
read<int>(d, mNbInputChannels); read<int>(d, mNbInputHeight); read<int>(d, mNbInputWidth);
read<int>(d, mNbInputCount); read<bool>(d, mChannelShared); read<int64_t>(d, mWeights.count);
read<DataType>(d, mWeights.type);
mWeights.values = malloc(mWeights.count * type2size(mWeights.type));
memcpy(const_cast<void *>(mWeights.values), d, mWeights.count * type2size(mWeights.type));
deserializeToDevice(d, mDeviceKernel, mWeights.count * type2size(mWeights.type));
assert(d == a + length);
}
Iplugin *PluginFactory::createPlugin(const char *layerName, const void *serialData, size_t serialLength) override
{
return new PReLUPlugin(serialData, serialLength);
}
PluginFactory pluginFactory;
engine = infer->deserializeCudaEngine(trt_plan_file, size, &pluginFactory);
cudaMemcpyAsync(buffers[inputIndex], input, batchSize * INPUT_SIZE * sizeof(float),
cudaMemcpyHostToDevice, stream);
context->enqueue(gParams.batchSize, &buffers[0], stream, nullptr);
cudaMemcpyAsync(output, buffers[outputIndex], batchSize * OUTPUT_SIZE * sizeof(float),
cudaMemcpyDeviceToHost, stream);
cudaStreamSynchronize(stream);
cudaStreamCreate(&stream));
IExecutionContext* context = engine->createExecutionContext();
[232]TensorRT를 활용한 딥러닝 Inference 최적화
[232]TensorRT를 활용한 딥러닝 Inference 최적화
[232]TensorRT를 활용한 딥러닝 Inference 최적화
[232]TensorRT를 활용한 딥러닝 Inference 최적화
[232]TensorRT를 활용한 딥러닝 Inference 최적화
[232]TensorRT를 활용한 딥러닝 Inference 최적화

More Related Content

What's hot

Go Profiling - John Graham-Cumming
Go Profiling - John Graham-Cumming Go Profiling - John Graham-Cumming
Go Profiling - John Graham-Cumming
Cloudflare
 
Vagrant + Rouster at salesforce.com - PuppetConf 2013
Vagrant + Rouster at salesforce.com - PuppetConf 2013Vagrant + Rouster at salesforce.com - PuppetConf 2013
Vagrant + Rouster at salesforce.com - PuppetConf 2013
Puppet
 

What's hot (20)

Redis & ZeroMQ: How to scale your application
Redis & ZeroMQ: How to scale your applicationRedis & ZeroMQ: How to scale your application
Redis & ZeroMQ: How to scale your application
 
Redis as a message queue
Redis as a message queueRedis as a message queue
Redis as a message queue
 
Bridge TensorFlow to run on Intel nGraph backends (v0.4)
Bridge TensorFlow to run on Intel nGraph backends (v0.4)Bridge TensorFlow to run on Intel nGraph backends (v0.4)
Bridge TensorFlow to run on Intel nGraph backends (v0.4)
 
RestMQ - HTTP/Redis based Message Queue
RestMQ - HTTP/Redis based Message QueueRestMQ - HTTP/Redis based Message Queue
RestMQ - HTTP/Redis based Message Queue
 
Php 5.6 From the Inside Out
Php 5.6 From the Inside OutPhp 5.6 From the Inside Out
Php 5.6 From the Inside Out
 
Memory Management of C# with Unity Native Collections
Memory Management of C# with Unity Native CollectionsMemory Management of C# with Unity Native Collections
Memory Management of C# with Unity Native Collections
 
Qt Rest Server
Qt Rest ServerQt Rest Server
Qt Rest Server
 
Bridge TensorFlow to run on Intel nGraph backends (v0.5)
Bridge TensorFlow to run on Intel nGraph backends (v0.5)Bridge TensorFlow to run on Intel nGraph backends (v0.5)
Bridge TensorFlow to run on Intel nGraph backends (v0.5)
 
Linux fundamental - Chap 14 shell script
Linux fundamental - Chap 14 shell scriptLinux fundamental - Chap 14 shell script
Linux fundamental - Chap 14 shell script
 
Python Load Testing - Pygotham 2012
Python Load Testing - Pygotham 2012Python Load Testing - Pygotham 2012
Python Load Testing - Pygotham 2012
 
Go Profiling - John Graham-Cumming
Go Profiling - John Graham-Cumming Go Profiling - John Graham-Cumming
Go Profiling - John Graham-Cumming
 
Node.js - Best practices
Node.js  - Best practicesNode.js  - Best practices
Node.js - Best practices
 
Node child process
Node child processNode child process
Node child process
 
Vagrant + Rouster at salesforce.com - PuppetConf 2013
Vagrant + Rouster at salesforce.com - PuppetConf 2013Vagrant + Rouster at salesforce.com - PuppetConf 2013
Vagrant + Rouster at salesforce.com - PuppetConf 2013
 
Using Puppet to Create a Dynamic Network - PuppetConf 2013
Using Puppet to Create a Dynamic Network - PuppetConf 2013Using Puppet to Create a Dynamic Network - PuppetConf 2013
Using Puppet to Create a Dynamic Network - PuppetConf 2013
 
Go memory
Go memoryGo memory
Go memory
 
Protocol handler in Gecko
Protocol handler in GeckoProtocol handler in Gecko
Protocol handler in Gecko
 
Binary Studio Academy: Concurrency in C# 5.0
Binary Studio Academy: Concurrency in C# 5.0Binary Studio Academy: Concurrency in C# 5.0
Binary Studio Academy: Concurrency in C# 5.0
 
Cooking pies with Celery
Cooking pies with CeleryCooking pies with Celery
Cooking pies with Celery
 
Openstack 簡介
Openstack 簡介Openstack 簡介
Openstack 簡介
 

Similar to [232]TensorRT를 활용한 딥러닝 Inference 최적화

服务框架: Thrift & PasteScript
服务框架: Thrift & PasteScript服务框架: Thrift & PasteScript
服务框架: Thrift & PasteScript
Qiangning Hong
 
In Class AssignmetzCST280W13a-1.pdfCST 280 In-Class Pract.docx
In Class AssignmetzCST280W13a-1.pdfCST 280 In-Class Pract.docxIn Class AssignmetzCST280W13a-1.pdfCST 280 In-Class Pract.docx
In Class AssignmetzCST280W13a-1.pdfCST 280 In-Class Pract.docx
bradburgess22840
 
Instruction1. Please read the two articles. (Kincheloe part 1 &.docx
Instruction1. Please read the two articles. (Kincheloe part 1 &.docxInstruction1. Please read the two articles. (Kincheloe part 1 &.docx
Instruction1. Please read the two articles. (Kincheloe part 1 &.docx
carliotwaycave
 
33rd Degree 2013, Bad Tests, Good Tests
33rd Degree 2013, Bad Tests, Good Tests33rd Degree 2013, Bad Tests, Good Tests
33rd Degree 2013, Bad Tests, Good Tests
Tomek Kaczanowski
 
VPN Access Runbook
VPN Access RunbookVPN Access Runbook
VPN Access Runbook
Taha Shakeel
 
Create a JAVA program that performs file IO and database interaction.pdf
Create a JAVA program that performs file IO and database interaction.pdfCreate a JAVA program that performs file IO and database interaction.pdf
Create a JAVA program that performs file IO and database interaction.pdf
malavshah9013
 
Below is the question I need help with. It need to be done in Java. .pdf
Below is the question I need help with. It need to be done in Java. .pdfBelow is the question I need help with. It need to be done in Java. .pdf
Below is the question I need help with. It need to be done in Java. .pdf
aroraenterprisesmbd
 

Similar to [232]TensorRT를 활용한 딥러닝 Inference 최적화 (20)

服务框架: Thrift & PasteScript
服务框架: Thrift & PasteScript服务框架: Thrift & PasteScript
服务框架: Thrift & PasteScript
 
In Class AssignmetzCST280W13a-1.pdfCST 280 In-Class Pract.docx
In Class AssignmetzCST280W13a-1.pdfCST 280 In-Class Pract.docxIn Class AssignmetzCST280W13a-1.pdfCST 280 In-Class Pract.docx
In Class AssignmetzCST280W13a-1.pdfCST 280 In-Class Pract.docx
 
Kapacitor - Real Time Data Processing Engine
Kapacitor - Real Time Data Processing EngineKapacitor - Real Time Data Processing Engine
Kapacitor - Real Time Data Processing Engine
 
Instruction1. Please read the two articles. (Kincheloe part 1 &.docx
Instruction1. Please read the two articles. (Kincheloe part 1 &.docxInstruction1. Please read the two articles. (Kincheloe part 1 &.docx
Instruction1. Please read the two articles. (Kincheloe part 1 &.docx
 
33rd Degree 2013, Bad Tests, Good Tests
33rd Degree 2013, Bad Tests, Good Tests33rd Degree 2013, Bad Tests, Good Tests
33rd Degree 2013, Bad Tests, Good Tests
 
如何透過 Go-kit 快速搭建微服務架構應用程式實戰
如何透過 Go-kit 快速搭建微服務架構應用程式實戰如何透過 Go-kit 快速搭建微服務架構應用程式實戰
如何透過 Go-kit 快速搭建微服務架構應用程式實戰
 
How Many Ohs? (An Integration Guide to Apex & Triple-o)
How Many Ohs? (An Integration Guide to Apex & Triple-o)How Many Ohs? (An Integration Guide to Apex & Triple-o)
How Many Ohs? (An Integration Guide to Apex & Triple-o)
 
Microkernel Development
Microkernel DevelopmentMicrokernel Development
Microkernel Development
 
How Does Kubernetes Build OpenAPI Specifications?
How Does Kubernetes Build OpenAPI Specifications?How Does Kubernetes Build OpenAPI Specifications?
How Does Kubernetes Build OpenAPI Specifications?
 
The Ring programming language version 1.5.3 book - Part 8 of 184
The Ring programming language version 1.5.3 book - Part 8 of 184The Ring programming language version 1.5.3 book - Part 8 of 184
The Ring programming language version 1.5.3 book - Part 8 of 184
 
VPN Access Runbook
VPN Access RunbookVPN Access Runbook
VPN Access Runbook
 
NodeJs
NodeJsNodeJs
NodeJs
 
The Ring programming language version 1.5.4 book - Part 8 of 185
The Ring programming language version 1.5.4 book - Part 8 of 185The Ring programming language version 1.5.4 book - Part 8 of 185
The Ring programming language version 1.5.4 book - Part 8 of 185
 
2012 JDays Bad Tests Good Tests
2012 JDays Bad Tests Good Tests2012 JDays Bad Tests Good Tests
2012 JDays Bad Tests Good Tests
 
MT_01_unittest_python.pdf
MT_01_unittest_python.pdfMT_01_unittest_python.pdf
MT_01_unittest_python.pdf
 
Where the wild things are - Benchmarking and Micro-Optimisations
Where the wild things are - Benchmarking and Micro-OptimisationsWhere the wild things are - Benchmarking and Micro-Optimisations
Where the wild things are - Benchmarking and Micro-Optimisations
 
Keras and TensorFlow
Keras and TensorFlowKeras and TensorFlow
Keras and TensorFlow
 
Create a JAVA program that performs file IO and database interaction.pdf
Create a JAVA program that performs file IO and database interaction.pdfCreate a JAVA program that performs file IO and database interaction.pdf
Create a JAVA program that performs file IO and database interaction.pdf
 
The Ring programming language version 1.5.1 book - Part 7 of 180
The Ring programming language version 1.5.1 book - Part 7 of 180The Ring programming language version 1.5.1 book - Part 7 of 180
The Ring programming language version 1.5.1 book - Part 7 of 180
 
Below is the question I need help with. It need to be done in Java. .pdf
Below is the question I need help with. It need to be done in Java. .pdfBelow is the question I need help with. It need to be done in Java. .pdf
Below is the question I need help with. It need to be done in Java. .pdf
 

More from NAVER D2

More from NAVER D2 (20)

[211] 인공지능이 인공지능 챗봇을 만든다
[211] 인공지능이 인공지능 챗봇을 만든다[211] 인공지능이 인공지능 챗봇을 만든다
[211] 인공지능이 인공지능 챗봇을 만든다
 
[233] 대형 컨테이너 클러스터에서의 고가용성 Network Load Balancing: Maglev Hashing Scheduler i...
[233] 대형 컨테이너 클러스터에서의 고가용성 Network Load Balancing: Maglev Hashing Scheduler i...[233] 대형 컨테이너 클러스터에서의 고가용성 Network Load Balancing: Maglev Hashing Scheduler i...
[233] 대형 컨테이너 클러스터에서의 고가용성 Network Load Balancing: Maglev Hashing Scheduler i...
 
[215] Druid로 쉽고 빠르게 데이터 분석하기
[215] Druid로 쉽고 빠르게 데이터 분석하기[215] Druid로 쉽고 빠르게 데이터 분석하기
[215] Druid로 쉽고 빠르게 데이터 분석하기
 
[245]Papago Internals: 모델분석과 응용기술 개발
[245]Papago Internals: 모델분석과 응용기술 개발[245]Papago Internals: 모델분석과 응용기술 개발
[245]Papago Internals: 모델분석과 응용기술 개발
 
[236] 스트림 저장소 최적화 이야기: 아파치 드루이드로부터 얻은 교훈
[236] 스트림 저장소 최적화 이야기: 아파치 드루이드로부터 얻은 교훈[236] 스트림 저장소 최적화 이야기: 아파치 드루이드로부터 얻은 교훈
[236] 스트림 저장소 최적화 이야기: 아파치 드루이드로부터 얻은 교훈
 
[235]Wikipedia-scale Q&A
[235]Wikipedia-scale Q&A[235]Wikipedia-scale Q&A
[235]Wikipedia-scale Q&A
 
[244]로봇이 현실 세계에 대해 학습하도록 만들기
[244]로봇이 현실 세계에 대해 학습하도록 만들기[244]로봇이 현실 세계에 대해 학습하도록 만들기
[244]로봇이 현실 세계에 대해 학습하도록 만들기
 
[243] Deep Learning to help student’s Deep Learning
[243] Deep Learning to help student’s Deep Learning[243] Deep Learning to help student’s Deep Learning
[243] Deep Learning to help student’s Deep Learning
 
[234]Fast & Accurate Data Annotation Pipeline for AI applications
[234]Fast & Accurate Data Annotation Pipeline for AI applications[234]Fast & Accurate Data Annotation Pipeline for AI applications
[234]Fast & Accurate Data Annotation Pipeline for AI applications
 
Old version: [233]대형 컨테이너 클러스터에서의 고가용성 Network Load Balancing
Old version: [233]대형 컨테이너 클러스터에서의 고가용성 Network Load BalancingOld version: [233]대형 컨테이너 클러스터에서의 고가용성 Network Load Balancing
Old version: [233]대형 컨테이너 클러스터에서의 고가용성 Network Load Balancing
 
[226]NAVER 광고 deep click prediction: 모델링부터 서빙까지
[226]NAVER 광고 deep click prediction: 모델링부터 서빙까지[226]NAVER 광고 deep click prediction: 모델링부터 서빙까지
[226]NAVER 광고 deep click prediction: 모델링부터 서빙까지
 
[225]NSML: 머신러닝 플랫폼 서비스하기 & 모델 튜닝 자동화하기
[225]NSML: 머신러닝 플랫폼 서비스하기 & 모델 튜닝 자동화하기[225]NSML: 머신러닝 플랫폼 서비스하기 & 모델 튜닝 자동화하기
[225]NSML: 머신러닝 플랫폼 서비스하기 & 모델 튜닝 자동화하기
 
[224]네이버 검색과 개인화
[224]네이버 검색과 개인화[224]네이버 검색과 개인화
[224]네이버 검색과 개인화
 
[216]Search Reliability Engineering (부제: 지진에도 흔들리지 않는 네이버 검색시스템)
[216]Search Reliability Engineering (부제: 지진에도 흔들리지 않는 네이버 검색시스템)[216]Search Reliability Engineering (부제: 지진에도 흔들리지 않는 네이버 검색시스템)
[216]Search Reliability Engineering (부제: 지진에도 흔들리지 않는 네이버 검색시스템)
 
[213] Fashion Visual Search
[213] Fashion Visual Search[213] Fashion Visual Search
[213] Fashion Visual Search
 
[232] TensorRT를 활용한 딥러닝 Inference 최적화
[232] TensorRT를 활용한 딥러닝 Inference 최적화[232] TensorRT를 활용한 딥러닝 Inference 최적화
[232] TensorRT를 활용한 딥러닝 Inference 최적화
 
[242]컴퓨터 비전을 이용한 실내 지도 자동 업데이트 방법: 딥러닝을 통한 POI 변화 탐지
[242]컴퓨터 비전을 이용한 실내 지도 자동 업데이트 방법: 딥러닝을 통한 POI 변화 탐지[242]컴퓨터 비전을 이용한 실내 지도 자동 업데이트 방법: 딥러닝을 통한 POI 변화 탐지
[242]컴퓨터 비전을 이용한 실내 지도 자동 업데이트 방법: 딥러닝을 통한 POI 변화 탐지
 
[212]C3, 데이터 처리에서 서빙까지 가능한 하둡 클러스터
[212]C3, 데이터 처리에서 서빙까지 가능한 하둡 클러스터[212]C3, 데이터 처리에서 서빙까지 가능한 하둡 클러스터
[212]C3, 데이터 처리에서 서빙까지 가능한 하둡 클러스터
 
[223]기계독해 QA: 검색인가, NLP인가?
[223]기계독해 QA: 검색인가, NLP인가?[223]기계독해 QA: 검색인가, NLP인가?
[223]기계독해 QA: 검색인가, NLP인가?
 
[231] Clova 화자인식
[231] Clova 화자인식[231] Clova 화자인식
[231] Clova 화자인식
 

Recently uploaded

Histor y of HAM Radio presentation slide
Histor y of HAM Radio presentation slideHistor y of HAM Radio presentation slide
Histor y of HAM Radio presentation slide
vu2urc
 
Artificial Intelligence: Facts and Myths
Artificial Intelligence: Facts and MythsArtificial Intelligence: Facts and Myths
Artificial Intelligence: Facts and Myths
Joaquim Jorge
 

Recently uploaded (20)

Strategies for Unlocking Knowledge Management in Microsoft 365 in the Copilot...
Strategies for Unlocking Knowledge Management in Microsoft 365 in the Copilot...Strategies for Unlocking Knowledge Management in Microsoft 365 in the Copilot...
Strategies for Unlocking Knowledge Management in Microsoft 365 in the Copilot...
 
Strategize a Smooth Tenant-to-tenant Migration and Copilot Takeoff
Strategize a Smooth Tenant-to-tenant Migration and Copilot TakeoffStrategize a Smooth Tenant-to-tenant Migration and Copilot Takeoff
Strategize a Smooth Tenant-to-tenant Migration and Copilot Takeoff
 
Tech Trends Report 2024 Future Today Institute.pdf
Tech Trends Report 2024 Future Today Institute.pdfTech Trends Report 2024 Future Today Institute.pdf
Tech Trends Report 2024 Future Today Institute.pdf
 
Scaling API-first – The story of a global engineering organization
Scaling API-first – The story of a global engineering organizationScaling API-first – The story of a global engineering organization
Scaling API-first – The story of a global engineering organization
 
🐬 The future of MySQL is Postgres 🐘
🐬  The future of MySQL is Postgres   🐘🐬  The future of MySQL is Postgres   🐘
🐬 The future of MySQL is Postgres 🐘
 
GenCyber Cyber Security Day Presentation
GenCyber Cyber Security Day PresentationGenCyber Cyber Security Day Presentation
GenCyber Cyber Security Day Presentation
 
Strategies for Landing an Oracle DBA Job as a Fresher
Strategies for Landing an Oracle DBA Job as a FresherStrategies for Landing an Oracle DBA Job as a Fresher
Strategies for Landing an Oracle DBA Job as a Fresher
 
Mastering MySQL Database Architecture: Deep Dive into MySQL Shell and MySQL R...
Mastering MySQL Database Architecture: Deep Dive into MySQL Shell and MySQL R...Mastering MySQL Database Architecture: Deep Dive into MySQL Shell and MySQL R...
Mastering MySQL Database Architecture: Deep Dive into MySQL Shell and MySQL R...
 
A Year of the Servo Reboot: Where Are We Now?
A Year of the Servo Reboot: Where Are We Now?A Year of the Servo Reboot: Where Are We Now?
A Year of the Servo Reboot: Where Are We Now?
 
04-2024-HHUG-Sales-and-Marketing-Alignment.pptx
04-2024-HHUG-Sales-and-Marketing-Alignment.pptx04-2024-HHUG-Sales-and-Marketing-Alignment.pptx
04-2024-HHUG-Sales-and-Marketing-Alignment.pptx
 
Real Time Object Detection Using Open CV
Real Time Object Detection Using Open CVReal Time Object Detection Using Open CV
Real Time Object Detection Using Open CV
 
Histor y of HAM Radio presentation slide
Histor y of HAM Radio presentation slideHistor y of HAM Radio presentation slide
Histor y of HAM Radio presentation slide
 
Boost PC performance: How more available memory can improve productivity
Boost PC performance: How more available memory can improve productivityBoost PC performance: How more available memory can improve productivity
Boost PC performance: How more available memory can improve productivity
 
[2024]Digital Global Overview Report 2024 Meltwater.pdf
[2024]Digital Global Overview Report 2024 Meltwater.pdf[2024]Digital Global Overview Report 2024 Meltwater.pdf
[2024]Digital Global Overview Report 2024 Meltwater.pdf
 
Artificial Intelligence: Facts and Myths
Artificial Intelligence: Facts and MythsArtificial Intelligence: Facts and Myths
Artificial Intelligence: Facts and Myths
 
Finology Group – Insurtech Innovation Award 2024
Finology Group – Insurtech Innovation Award 2024Finology Group – Insurtech Innovation Award 2024
Finology Group – Insurtech Innovation Award 2024
 
Automating Google Workspace (GWS) & more with Apps Script
Automating Google Workspace (GWS) & more with Apps ScriptAutomating Google Workspace (GWS) & more with Apps Script
Automating Google Workspace (GWS) & more with Apps Script
 
How to Troubleshoot Apps for the Modern Connected Worker
How to Troubleshoot Apps for the Modern Connected WorkerHow to Troubleshoot Apps for the Modern Connected Worker
How to Troubleshoot Apps for the Modern Connected Worker
 
TrustArc Webinar - Stay Ahead of US State Data Privacy Law Developments
TrustArc Webinar - Stay Ahead of US State Data Privacy Law DevelopmentsTrustArc Webinar - Stay Ahead of US State Data Privacy Law Developments
TrustArc Webinar - Stay Ahead of US State Data Privacy Law Developments
 
Driving Behavioral Change for Information Management through Data-Driven Gree...
Driving Behavioral Change for Information Management through Data-Driven Gree...Driving Behavioral Change for Information Management through Data-Driven Gree...
Driving Behavioral Change for Information Management through Data-Driven Gree...
 

[232]TensorRT를 활용한 딥러닝 Inference 최적화

  • 1.
  • 2.
  • 3.
  • 4.
  • 5.
  • 6.
  • 7.
  • 8.
  • 9.
  • 10.
  • 11.
  • 12.
  • 13.
  • 14. Step 1: TF모델을 TRT 포맷으로 변환 Step 2: 모델 Parser 생성 Step 3: 입/출력 레이어 정보 입력 Step 4: 모델의 최적화 및 런타임 Engine 생성 Step 5: 엔진을 파일로 저장 Step 6: 엔진을 파일에서 읽음 Step 7: Inference 수행
  • 15.
  • 16.
  • 17.
  • 18.
  • 19.
  • 20.
  • 21.
  • 22.
  • 24. • • • PReLUPlugin::PReLUPlugin(const Weights *weights, int nbWeights) { mWeights = weights[0]; mWeights.values = malloc(mWeights.count * type2size(mWeights.type)); memcpy(const_cast<void *>(mWeights.values), weights[0].values, mWeights.count * type2size(mWeights.type)); }
  • 25. int PReLUPlugin::enqueue(int batchSize, const void *const *inputs, void **outputs, void *workspace, cudaStream_t stream) { const float zerof{0.0f}; const __half zeroh = fp16::__float2half(0.0f); if (mWeights.type == DataType::__float) { CHECK(Forward_gpu<__float>(batchSize * mNbInputCount, mNbInputChannels, mNbInputHeight * mNbInputHeight, reinterpret_cast<const __float *>(mDeviceKernel), reinterpret_cast<const __float *>(inputs[0]), reinterpret_cast<__float *>(outputs[0]), zerof, mChannelShared ? mNbInputChannels : 1, stream)); } else { // DataType::kFLOAT } return 0; }
  • 26. template <typename Ftype> __global__ void PReLUForward(const int n, const int channels, const int dim, const Ftype* slope_data, const Ftype* in, Ftype* out, const Ftype zero, const int div_factor) { CUDA_KERNEL_LOOP(index, n) { int c = (index / dim) % channels / div_factor; out[index] = (in[index] > (Ftype(zero))) ? in[index] : in[index] * *(reinterpret_cast<const Ftype*>(slope_data)+c); } }
  • 27. template <typename Ftype> cudaError_t Forward_gpu(const int count, const int channels, const int dim, const Ftype* mDeviceKernel, const Ftype* bottom_data, Ftype* top_data, const Ftype zero, const int div_factor, const cudaStream_t stream) { PReLUForward<<<CAFFE_GET_BLOCKS(count), CAFFE_CUDA_NUM_THREADS, 0, stream>>> (count, channels, dim, mDeviceKernel, bottom_data, top_data, zero, div_factor); return cudaGetLastError(); }
  • 28.
  • 29. IPluginExt *PReLUPlugin::clone() const override { return new PReLUPlugin(&mWeights, 1); } IPlugin* pluginFactory::createPlugin(const char* layerName, const Weights* serialData, int nbWeights) override { return new PReLUPlugin(serialData, serialLength); }
  • 30. PluginFactory parserPluginFactory; parser->setPluginFactoryExt(&parserPluginFactory); const IBlobNameToTensor *blobNameToTensor = parser->parse(gParams.deployFile.c_str(), // caffe deploy file gParams.modelFile.c_str(), // caffe model file *network, // network definition that the parser will populate gParams.fp16 ? DataType::kHALF : DataType::kFLOAT);
  • 32. void PReLUPlugin::serialize(void *buffer) { char *d = static_cast<char *>(buffer), *a = d; write(d, mNbInputChannels); write(d, mNbInputHeight); write(d, mNbInputWidth); write(d, mNbInputCount); write(d, mChannelShared); write(d, mWeights.count); write(d, mWeights.type); convertAndCopyToBuffer(d, mWeights); assert(d == a + getSerializationSize()); }
  • 33. PReLUPlugin::PReLUPlugin(const void *data, size_t length) { const char *d = static_cast<const char *>(data), *a = d; read<int>(d, mNbInputChannels); read<int>(d, mNbInputHeight); read<int>(d, mNbInputWidth); read<int>(d, mNbInputCount); read<bool>(d, mChannelShared); read<int64_t>(d, mWeights.count); read<DataType>(d, mWeights.type); mWeights.values = malloc(mWeights.count * type2size(mWeights.type)); memcpy(const_cast<void *>(mWeights.values), d, mWeights.count * type2size(mWeights.type)); deserializeToDevice(d, mDeviceKernel, mWeights.count * type2size(mWeights.type)); assert(d == a + length); }
  • 34. Iplugin *PluginFactory::createPlugin(const char *layerName, const void *serialData, size_t serialLength) override { return new PReLUPlugin(serialData, serialLength); }
  • 35. PluginFactory pluginFactory; engine = infer->deserializeCudaEngine(trt_plan_file, size, &pluginFactory);
  • 36. cudaMemcpyAsync(buffers[inputIndex], input, batchSize * INPUT_SIZE * sizeof(float), cudaMemcpyHostToDevice, stream); context->enqueue(gParams.batchSize, &buffers[0], stream, nullptr); cudaMemcpyAsync(output, buffers[outputIndex], batchSize * OUTPUT_SIZE * sizeof(float), cudaMemcpyDeviceToHost, stream); cudaStreamSynchronize(stream); cudaStreamCreate(&stream)); IExecutionContext* context = engine->createExecutionContext();