SlideShare une entreprise Scribd logo
1  sur  42
Télécharger pour lire hors ligne
Step 1: TF모델을 TRT 포맷으로 변환
Step 2: 모델 Parser 생성
Step 3: 입/출력 레이어 정보 입력
Step 4: 모델의 최적화 및
런타임 Engine 생성
Step 5: 엔진을 파일로 저장
Step 6: 엔진을 파일에서 읽음
Step 7: Inference 수행
•
•
•
•
•
•
•
•
•
PReLUPlugin::PReLUPlugin(const Weights *weights, int nbWeights) {
mWeights = weights[0];
mWeights.values = malloc(mWeights.count * type2size(mWeights.type));
memcpy(const_cast<void *>(mWeights.values), weights[0].values, mWeights.count * type2size(mWeights.type));
}
int PReLUPlugin::enqueue(int batchSize, const void *const *inputs, void **outputs, void *workspace,
cudaStream_t stream) {
const float zerof{0.0f}; const __half zeroh = fp16::__float2half(0.0f);
if (mWeights.type == DataType::__float) {
CHECK(Forward_gpu<__float>(batchSize * mNbInputCount, mNbInputChannels,
mNbInputHeight * mNbInputHeight, reinterpret_cast<const __float *>(mDeviceKernel),
reinterpret_cast<const __float *>(inputs[0]), reinterpret_cast<__float *>(outputs[0]),
zerof, mChannelShared ? mNbInputChannels : 1, stream));
} else { // DataType::kFLOAT }
return 0;
}
template <typename Ftype>
__global__ void PReLUForward(const int n, const int channels, const int dim, const Ftype* slope_data, const
Ftype* in, Ftype* out, const Ftype zero, const int div_factor) {
CUDA_KERNEL_LOOP(index, n) {
int c = (index / dim) % channels / div_factor;
out[index] = (in[index] > (Ftype(zero))) ? in[index] :
in[index] * *(reinterpret_cast<const Ftype*>(slope_data)+c);
}
}
template <typename Ftype>
cudaError_t Forward_gpu(const int count, const int channels, const int dim, const Ftype* mDeviceKernel,
const Ftype* bottom_data, Ftype* top_data, const Ftype zero, const int div_factor, const cudaStream_t stream) {
PReLUForward<<<CAFFE_GET_BLOCKS(count), CAFFE_CUDA_NUM_THREADS, 0, stream>>>
(count, channels, dim, mDeviceKernel, bottom_data, top_data, zero, div_factor);
return cudaGetLastError();
}
IPluginExt *PReLUPlugin::clone() const override {
return new PReLUPlugin(&mWeights, 1);
}
IPlugin* pluginFactory::createPlugin(const char* layerName, const Weights* serialData, int nbWeights) override {
return new PReLUPlugin(serialData, serialLength);
}
PluginFactory parserPluginFactory;
parser->setPluginFactoryExt(&parserPluginFactory);
const IBlobNameToTensor *blobNameToTensor =
parser->parse(gParams.deployFile.c_str(), // caffe deploy file
gParams.modelFile.c_str(), // caffe model file
*network, // network definition that the parser will populate
gParams.fp16 ? DataType::kHALF : DataType::kFLOAT);
builder->setMaxBatchSize(gParams.batchSize);
builder->setMaxWorkspaceSize(size_t(gParams.workspaceSize) << 20);
builder->setFp16Mode(gParams.fp16);
ICudaEngine* engine = builder->buildCudaEngine(*network);
void PReLUPlugin::serialize(void *buffer) {
char *d = static_cast<char *>(buffer), *a = d;
write(d, mNbInputChannels); write(d, mNbInputHeight); write(d, mNbInputWidth); write(d, mNbInputCount);
write(d, mChannelShared); write(d, mWeights.count); write(d, mWeights.type);
convertAndCopyToBuffer(d, mWeights);
assert(d == a + getSerializationSize());
}
PReLUPlugin::PReLUPlugin(const void *data, size_t length) {
const char *d = static_cast<const char *>(data), *a = d;
read<int>(d, mNbInputChannels); read<int>(d, mNbInputHeight); read<int>(d, mNbInputWidth);
read<int>(d, mNbInputCount); read<bool>(d, mChannelShared); read<int64_t>(d, mWeights.count);
read<DataType>(d, mWeights.type);
mWeights.values = malloc(mWeights.count * type2size(mWeights.type));
memcpy(const_cast<void *>(mWeights.values), d, mWeights.count * type2size(mWeights.type));
deserializeToDevice(d, mDeviceKernel, mWeights.count * type2size(mWeights.type));
assert(d == a + length);
}
Iplugin *PluginFactory::createPlugin(const char *layerName, const void *serialData, size_t serialLength) override
{
return new PReLUPlugin(serialData, serialLength);
}
PluginFactory pluginFactory;
engine = infer->deserializeCudaEngine(trt_plan_file, size, &pluginFactory);
cudaMemcpyAsync(buffers[inputIndex], input, batchSize * INPUT_SIZE * sizeof(float),
cudaMemcpyHostToDevice, stream);
context->enqueue(gParams.batchSize, &buffers[0], stream, nullptr);
cudaMemcpyAsync(output, buffers[outputIndex], batchSize * OUTPUT_SIZE * sizeof(float),
cudaMemcpyDeviceToHost, stream);
cudaStreamSynchronize(stream);
cudaStreamCreate(&stream));
IExecutionContext* context = engine->createExecutionContext();
[232] TensorRT를 활용한 딥러닝 Inference 최적화
[232] TensorRT를 활용한 딥러닝 Inference 최적화
[232] TensorRT를 활용한 딥러닝 Inference 최적화
[232] TensorRT를 활용한 딥러닝 Inference 최적화
[232] TensorRT를 활용한 딥러닝 Inference 최적화
[232] TensorRT를 활용한 딥러닝 Inference 최적화

Contenu connexe

Tendances

The innerHTML Apocalypse
The innerHTML ApocalypseThe innerHTML Apocalypse
The innerHTML ApocalypseMario Heiderich
 
Asynchronous JS in Odoo
Asynchronous JS in OdooAsynchronous JS in Odoo
Asynchronous JS in OdooOdoo
 
Valgrind tutorial
Valgrind tutorialValgrind tutorial
Valgrind tutorialSatabdi Das
 
Hacking Oracle From Web Apps 1 9
Hacking Oracle From Web Apps 1 9Hacking Oracle From Web Apps 1 9
Hacking Oracle From Web Apps 1 9sumsid1234
 
Functional Programming Patterns (BuildStuff '14)
Functional Programming Patterns (BuildStuff '14)Functional Programming Patterns (BuildStuff '14)
Functional Programming Patterns (BuildStuff '14)Scott Wlaschin
 
Morel, a Functional Query Language
Morel, a Functional Query LanguageMorel, a Functional Query Language
Morel, a Functional Query LanguageJulian Hyde
 
Kvm performance optimization for ubuntu
Kvm performance optimization for ubuntuKvm performance optimization for ubuntu
Kvm performance optimization for ubuntuSim Janghoon
 
Namespaces and cgroups - the basis of Linux containers
Namespaces and cgroups - the basis of Linux containersNamespaces and cgroups - the basis of Linux containers
Namespaces and cgroups - the basis of Linux containersKernel TLV
 
LinuxCon 2015 Linux Kernel Networking Walkthrough
LinuxCon 2015 Linux Kernel Networking WalkthroughLinuxCon 2015 Linux Kernel Networking Walkthrough
LinuxCon 2015 Linux Kernel Networking WalkthroughThomas Graf
 
Network Automation: Ansible 102
Network Automation: Ansible 102Network Automation: Ansible 102
Network Automation: Ansible 102APNIC
 
Tools for Solving Performance Issues
Tools for Solving Performance IssuesTools for Solving Performance Issues
Tools for Solving Performance IssuesOdoo
 
Cours formulaire html - niveau débutant
Cours formulaire html - niveau débutantCours formulaire html - niveau débutant
Cours formulaire html - niveau débutantTheBest Icanbe
 
C++20 the small things - Timur Doumler
C++20 the small things - Timur DoumlerC++20 the small things - Timur Doumler
C++20 the small things - Timur Doumlercorehard_by
 
Infinitic: Building a Workflow Engine on Top of Pulsar - Pulsar Summit NA 2021
 Infinitic: Building a Workflow Engine on Top of Pulsar - Pulsar Summit NA 2021 Infinitic: Building a Workflow Engine on Top of Pulsar - Pulsar Summit NA 2021
Infinitic: Building a Workflow Engine on Top of Pulsar - Pulsar Summit NA 2021StreamNative
 

Tendances (20)

The innerHTML Apocalypse
The innerHTML ApocalypseThe innerHTML Apocalypse
The innerHTML Apocalypse
 
Asynchronous JS in Odoo
Asynchronous JS in OdooAsynchronous JS in Odoo
Asynchronous JS in Odoo
 
たのしい関数型
たのしい関数型たのしい関数型
たのしい関数型
 
TensorRT survey
TensorRT surveyTensorRT survey
TensorRT survey
 
Dpdk performance
Dpdk performanceDpdk performance
Dpdk performance
 
Rust
RustRust
Rust
 
Valgrind tutorial
Valgrind tutorialValgrind tutorial
Valgrind tutorial
 
Hacking Oracle From Web Apps 1 9
Hacking Oracle From Web Apps 1 9Hacking Oracle From Web Apps 1 9
Hacking Oracle From Web Apps 1 9
 
Functional Programming Patterns (BuildStuff '14)
Functional Programming Patterns (BuildStuff '14)Functional Programming Patterns (BuildStuff '14)
Functional Programming Patterns (BuildStuff '14)
 
Morel, a Functional Query Language
Morel, a Functional Query LanguageMorel, a Functional Query Language
Morel, a Functional Query Language
 
Kvm performance optimization for ubuntu
Kvm performance optimization for ubuntuKvm performance optimization for ubuntu
Kvm performance optimization for ubuntu
 
Namespaces and cgroups - the basis of Linux containers
Namespaces and cgroups - the basis of Linux containersNamespaces and cgroups - the basis of Linux containers
Namespaces and cgroups - the basis of Linux containers
 
LinuxCon 2015 Linux Kernel Networking Walkthrough
LinuxCon 2015 Linux Kernel Networking WalkthroughLinuxCon 2015 Linux Kernel Networking Walkthrough
LinuxCon 2015 Linux Kernel Networking Walkthrough
 
Network Automation: Ansible 102
Network Automation: Ansible 102Network Automation: Ansible 102
Network Automation: Ansible 102
 
Tools for Solving Performance Issues
Tools for Solving Performance IssuesTools for Solving Performance Issues
Tools for Solving Performance Issues
 
Cours formulaire html - niveau débutant
Cours formulaire html - niveau débutantCours formulaire html - niveau débutant
Cours formulaire html - niveau débutant
 
C++20 the small things - Timur Doumler
C++20 the small things - Timur DoumlerC++20 the small things - Timur Doumler
C++20 the small things - Timur Doumler
 
Infinitic: Building a Workflow Engine on Top of Pulsar - Pulsar Summit NA 2021
 Infinitic: Building a Workflow Engine on Top of Pulsar - Pulsar Summit NA 2021 Infinitic: Building a Workflow Engine on Top of Pulsar - Pulsar Summit NA 2021
Infinitic: Building a Workflow Engine on Top of Pulsar - Pulsar Summit NA 2021
 
New PHP Exploitation Techniques
New PHP Exploitation TechniquesNew PHP Exploitation Techniques
New PHP Exploitation Techniques
 
Cours javascript v1
Cours javascript v1Cours javascript v1
Cours javascript v1
 

Similaire à [232] TensorRT를 활용한 딥러닝 Inference 최적화

服务框架: Thrift & PasteScript
服务框架: Thrift & PasteScript服务框架: Thrift & PasteScript
服务框架: Thrift & PasteScriptQiangning Hong
 
In Class AssignmetzCST280W13a-1.pdfCST 280 In-Class Pract.docx
In Class AssignmetzCST280W13a-1.pdfCST 280 In-Class Pract.docxIn Class AssignmetzCST280W13a-1.pdfCST 280 In-Class Pract.docx
In Class AssignmetzCST280W13a-1.pdfCST 280 In-Class Pract.docxbradburgess22840
 
Protocol handler in Gecko
Protocol handler in GeckoProtocol handler in Gecko
Protocol handler in GeckoChih-Hsuan Kuo
 
Kapacitor - Real Time Data Processing Engine
Kapacitor - Real Time Data Processing EngineKapacitor - Real Time Data Processing Engine
Kapacitor - Real Time Data Processing EnginePrashant Vats
 
Instruction1. Please read the two articles. (Kincheloe part 1 &.docx
Instruction1. Please read the two articles. (Kincheloe part 1 &.docxInstruction1. Please read the two articles. (Kincheloe part 1 &.docx
Instruction1. Please read the two articles. (Kincheloe part 1 &.docxcarliotwaycave
 
Php 5.6 From the Inside Out
Php 5.6 From the Inside OutPhp 5.6 From the Inside Out
Php 5.6 From the Inside OutFerenc Kovács
 
33rd Degree 2013, Bad Tests, Good Tests
33rd Degree 2013, Bad Tests, Good Tests33rd Degree 2013, Bad Tests, Good Tests
33rd Degree 2013, Bad Tests, Good TestsTomek Kaczanowski
 
如何透過 Go-kit 快速搭建微服務架構應用程式實戰
如何透過 Go-kit 快速搭建微服務架構應用程式實戰如何透過 Go-kit 快速搭建微服務架構應用程式實戰
如何透過 Go-kit 快速搭建微服務架構應用程式實戰KAI CHU CHUNG
 
How Many Ohs? (An Integration Guide to Apex & Triple-o)
How Many Ohs? (An Integration Guide to Apex & Triple-o)How Many Ohs? (An Integration Guide to Apex & Triple-o)
How Many Ohs? (An Integration Guide to Apex & Triple-o)OPNFV
 
How Does Kubernetes Build OpenAPI Specifications?
How Does Kubernetes Build OpenAPI Specifications?How Does Kubernetes Build OpenAPI Specifications?
How Does Kubernetes Build OpenAPI Specifications?reallavalamp
 
The Ring programming language version 1.5.3 book - Part 8 of 184
The Ring programming language version 1.5.3 book - Part 8 of 184The Ring programming language version 1.5.3 book - Part 8 of 184
The Ring programming language version 1.5.3 book - Part 8 of 184Mahmoud Samir Fayed
 
VPN Access Runbook
VPN Access RunbookVPN Access Runbook
VPN Access RunbookTaha Shakeel
 
NodeJs
NodeJsNodeJs
NodeJsdizabl
 
The Ring programming language version 1.5.4 book - Part 8 of 185
The Ring programming language version 1.5.4 book - Part 8 of 185The Ring programming language version 1.5.4 book - Part 8 of 185
The Ring programming language version 1.5.4 book - Part 8 of 185Mahmoud Samir Fayed
 
2012 JDays Bad Tests Good Tests
2012 JDays Bad Tests Good Tests2012 JDays Bad Tests Good Tests
2012 JDays Bad Tests Good TestsTomek Kaczanowski
 
MT_01_unittest_python.pdf
MT_01_unittest_python.pdfMT_01_unittest_python.pdf
MT_01_unittest_python.pdfHans Jones
 
Where the wild things are - Benchmarking and Micro-Optimisations
Where the wild things are - Benchmarking and Micro-OptimisationsWhere the wild things are - Benchmarking and Micro-Optimisations
Where the wild things are - Benchmarking and Micro-OptimisationsMatt Warren
 
Create a JAVA program that performs file IO and database interaction.pdf
Create a JAVA program that performs file IO and database interaction.pdfCreate a JAVA program that performs file IO and database interaction.pdf
Create a JAVA program that performs file IO and database interaction.pdfmalavshah9013
 

Similaire à [232] TensorRT를 활용한 딥러닝 Inference 최적화 (20)

服务框架: Thrift & PasteScript
服务框架: Thrift & PasteScript服务框架: Thrift & PasteScript
服务框架: Thrift & PasteScript
 
In Class AssignmetzCST280W13a-1.pdfCST 280 In-Class Pract.docx
In Class AssignmetzCST280W13a-1.pdfCST 280 In-Class Pract.docxIn Class AssignmetzCST280W13a-1.pdfCST 280 In-Class Pract.docx
In Class AssignmetzCST280W13a-1.pdfCST 280 In-Class Pract.docx
 
Protocol handler in Gecko
Protocol handler in GeckoProtocol handler in Gecko
Protocol handler in Gecko
 
Kapacitor - Real Time Data Processing Engine
Kapacitor - Real Time Data Processing EngineKapacitor - Real Time Data Processing Engine
Kapacitor - Real Time Data Processing Engine
 
Instruction1. Please read the two articles. (Kincheloe part 1 &.docx
Instruction1. Please read the two articles. (Kincheloe part 1 &.docxInstruction1. Please read the two articles. (Kincheloe part 1 &.docx
Instruction1. Please read the two articles. (Kincheloe part 1 &.docx
 
Php 5.6 From the Inside Out
Php 5.6 From the Inside OutPhp 5.6 From the Inside Out
Php 5.6 From the Inside Out
 
33rd Degree 2013, Bad Tests, Good Tests
33rd Degree 2013, Bad Tests, Good Tests33rd Degree 2013, Bad Tests, Good Tests
33rd Degree 2013, Bad Tests, Good Tests
 
如何透過 Go-kit 快速搭建微服務架構應用程式實戰
如何透過 Go-kit 快速搭建微服務架構應用程式實戰如何透過 Go-kit 快速搭建微服務架構應用程式實戰
如何透過 Go-kit 快速搭建微服務架構應用程式實戰
 
How Many Ohs? (An Integration Guide to Apex & Triple-o)
How Many Ohs? (An Integration Guide to Apex & Triple-o)How Many Ohs? (An Integration Guide to Apex & Triple-o)
How Many Ohs? (An Integration Guide to Apex & Triple-o)
 
Microkernel Development
Microkernel DevelopmentMicrokernel Development
Microkernel Development
 
How Does Kubernetes Build OpenAPI Specifications?
How Does Kubernetes Build OpenAPI Specifications?How Does Kubernetes Build OpenAPI Specifications?
How Does Kubernetes Build OpenAPI Specifications?
 
The Ring programming language version 1.5.3 book - Part 8 of 184
The Ring programming language version 1.5.3 book - Part 8 of 184The Ring programming language version 1.5.3 book - Part 8 of 184
The Ring programming language version 1.5.3 book - Part 8 of 184
 
VPN Access Runbook
VPN Access RunbookVPN Access Runbook
VPN Access Runbook
 
NodeJs
NodeJsNodeJs
NodeJs
 
The Ring programming language version 1.5.4 book - Part 8 of 185
The Ring programming language version 1.5.4 book - Part 8 of 185The Ring programming language version 1.5.4 book - Part 8 of 185
The Ring programming language version 1.5.4 book - Part 8 of 185
 
2012 JDays Bad Tests Good Tests
2012 JDays Bad Tests Good Tests2012 JDays Bad Tests Good Tests
2012 JDays Bad Tests Good Tests
 
MT_01_unittest_python.pdf
MT_01_unittest_python.pdfMT_01_unittest_python.pdf
MT_01_unittest_python.pdf
 
Where the wild things are - Benchmarking and Micro-Optimisations
Where the wild things are - Benchmarking and Micro-OptimisationsWhere the wild things are - Benchmarking and Micro-Optimisations
Where the wild things are - Benchmarking and Micro-Optimisations
 
Keras and TensorFlow
Keras and TensorFlowKeras and TensorFlow
Keras and TensorFlow
 
Create a JAVA program that performs file IO and database interaction.pdf
Create a JAVA program that performs file IO and database interaction.pdfCreate a JAVA program that performs file IO and database interaction.pdf
Create a JAVA program that performs file IO and database interaction.pdf
 

Plus de NAVER D2

[211] 인공지능이 인공지능 챗봇을 만든다
[211] 인공지능이 인공지능 챗봇을 만든다[211] 인공지능이 인공지능 챗봇을 만든다
[211] 인공지능이 인공지능 챗봇을 만든다NAVER D2
 
[233] 대형 컨테이너 클러스터에서의 고가용성 Network Load Balancing: Maglev Hashing Scheduler i...
[233] 대형 컨테이너 클러스터에서의 고가용성 Network Load Balancing: Maglev Hashing Scheduler i...[233] 대형 컨테이너 클러스터에서의 고가용성 Network Load Balancing: Maglev Hashing Scheduler i...
[233] 대형 컨테이너 클러스터에서의 고가용성 Network Load Balancing: Maglev Hashing Scheduler i...NAVER D2
 
[215] Druid로 쉽고 빠르게 데이터 분석하기
[215] Druid로 쉽고 빠르게 데이터 분석하기[215] Druid로 쉽고 빠르게 데이터 분석하기
[215] Druid로 쉽고 빠르게 데이터 분석하기NAVER D2
 
[245]Papago Internals: 모델분석과 응용기술 개발
[245]Papago Internals: 모델분석과 응용기술 개발[245]Papago Internals: 모델분석과 응용기술 개발
[245]Papago Internals: 모델분석과 응용기술 개발NAVER D2
 
[236] 스트림 저장소 최적화 이야기: 아파치 드루이드로부터 얻은 교훈
[236] 스트림 저장소 최적화 이야기: 아파치 드루이드로부터 얻은 교훈[236] 스트림 저장소 최적화 이야기: 아파치 드루이드로부터 얻은 교훈
[236] 스트림 저장소 최적화 이야기: 아파치 드루이드로부터 얻은 교훈NAVER D2
 
[235]Wikipedia-scale Q&A
[235]Wikipedia-scale Q&A[235]Wikipedia-scale Q&A
[235]Wikipedia-scale Q&ANAVER D2
 
[244]로봇이 현실 세계에 대해 학습하도록 만들기
[244]로봇이 현실 세계에 대해 학습하도록 만들기[244]로봇이 현실 세계에 대해 학습하도록 만들기
[244]로봇이 현실 세계에 대해 학습하도록 만들기NAVER D2
 
[243] Deep Learning to help student’s Deep Learning
[243] Deep Learning to help student’s Deep Learning[243] Deep Learning to help student’s Deep Learning
[243] Deep Learning to help student’s Deep LearningNAVER D2
 
[234]Fast & Accurate Data Annotation Pipeline for AI applications
[234]Fast & Accurate Data Annotation Pipeline for AI applications[234]Fast & Accurate Data Annotation Pipeline for AI applications
[234]Fast & Accurate Data Annotation Pipeline for AI applicationsNAVER D2
 
Old version: [233]대형 컨테이너 클러스터에서의 고가용성 Network Load Balancing
Old version: [233]대형 컨테이너 클러스터에서의 고가용성 Network Load BalancingOld version: [233]대형 컨테이너 클러스터에서의 고가용성 Network Load Balancing
Old version: [233]대형 컨테이너 클러스터에서의 고가용성 Network Load BalancingNAVER D2
 
[226]NAVER 광고 deep click prediction: 모델링부터 서빙까지
[226]NAVER 광고 deep click prediction: 모델링부터 서빙까지[226]NAVER 광고 deep click prediction: 모델링부터 서빙까지
[226]NAVER 광고 deep click prediction: 모델링부터 서빙까지NAVER D2
 
[225]NSML: 머신러닝 플랫폼 서비스하기 & 모델 튜닝 자동화하기
[225]NSML: 머신러닝 플랫폼 서비스하기 & 모델 튜닝 자동화하기[225]NSML: 머신러닝 플랫폼 서비스하기 & 모델 튜닝 자동화하기
[225]NSML: 머신러닝 플랫폼 서비스하기 & 모델 튜닝 자동화하기NAVER D2
 
[224]네이버 검색과 개인화
[224]네이버 검색과 개인화[224]네이버 검색과 개인화
[224]네이버 검색과 개인화NAVER D2
 
[216]Search Reliability Engineering (부제: 지진에도 흔들리지 않는 네이버 검색시스템)
[216]Search Reliability Engineering (부제: 지진에도 흔들리지 않는 네이버 검색시스템)[216]Search Reliability Engineering (부제: 지진에도 흔들리지 않는 네이버 검색시스템)
[216]Search Reliability Engineering (부제: 지진에도 흔들리지 않는 네이버 검색시스템)NAVER D2
 
[214] Ai Serving Platform: 하루 수 억 건의 인퍼런스를 처리하기 위한 고군분투기
[214] Ai Serving Platform: 하루 수 억 건의 인퍼런스를 처리하기 위한 고군분투기[214] Ai Serving Platform: 하루 수 억 건의 인퍼런스를 처리하기 위한 고군분투기
[214] Ai Serving Platform: 하루 수 억 건의 인퍼런스를 처리하기 위한 고군분투기NAVER D2
 
[213] Fashion Visual Search
[213] Fashion Visual Search[213] Fashion Visual Search
[213] Fashion Visual SearchNAVER D2
 
[242]컴퓨터 비전을 이용한 실내 지도 자동 업데이트 방법: 딥러닝을 통한 POI 변화 탐지
[242]컴퓨터 비전을 이용한 실내 지도 자동 업데이트 방법: 딥러닝을 통한 POI 변화 탐지[242]컴퓨터 비전을 이용한 실내 지도 자동 업데이트 방법: 딥러닝을 통한 POI 변화 탐지
[242]컴퓨터 비전을 이용한 실내 지도 자동 업데이트 방법: 딥러닝을 통한 POI 변화 탐지NAVER D2
 
[212]C3, 데이터 처리에서 서빙까지 가능한 하둡 클러스터
[212]C3, 데이터 처리에서 서빙까지 가능한 하둡 클러스터[212]C3, 데이터 처리에서 서빙까지 가능한 하둡 클러스터
[212]C3, 데이터 처리에서 서빙까지 가능한 하둡 클러스터NAVER D2
 
[223]기계독해 QA: 검색인가, NLP인가?
[223]기계독해 QA: 검색인가, NLP인가?[223]기계독해 QA: 검색인가, NLP인가?
[223]기계독해 QA: 검색인가, NLP인가?NAVER D2
 
[231] Clova 화자인식
[231] Clova 화자인식[231] Clova 화자인식
[231] Clova 화자인식NAVER D2
 

Plus de NAVER D2 (20)

[211] 인공지능이 인공지능 챗봇을 만든다
[211] 인공지능이 인공지능 챗봇을 만든다[211] 인공지능이 인공지능 챗봇을 만든다
[211] 인공지능이 인공지능 챗봇을 만든다
 
[233] 대형 컨테이너 클러스터에서의 고가용성 Network Load Balancing: Maglev Hashing Scheduler i...
[233] 대형 컨테이너 클러스터에서의 고가용성 Network Load Balancing: Maglev Hashing Scheduler i...[233] 대형 컨테이너 클러스터에서의 고가용성 Network Load Balancing: Maglev Hashing Scheduler i...
[233] 대형 컨테이너 클러스터에서의 고가용성 Network Load Balancing: Maglev Hashing Scheduler i...
 
[215] Druid로 쉽고 빠르게 데이터 분석하기
[215] Druid로 쉽고 빠르게 데이터 분석하기[215] Druid로 쉽고 빠르게 데이터 분석하기
[215] Druid로 쉽고 빠르게 데이터 분석하기
 
[245]Papago Internals: 모델분석과 응용기술 개발
[245]Papago Internals: 모델분석과 응용기술 개발[245]Papago Internals: 모델분석과 응용기술 개발
[245]Papago Internals: 모델분석과 응용기술 개발
 
[236] 스트림 저장소 최적화 이야기: 아파치 드루이드로부터 얻은 교훈
[236] 스트림 저장소 최적화 이야기: 아파치 드루이드로부터 얻은 교훈[236] 스트림 저장소 최적화 이야기: 아파치 드루이드로부터 얻은 교훈
[236] 스트림 저장소 최적화 이야기: 아파치 드루이드로부터 얻은 교훈
 
[235]Wikipedia-scale Q&A
[235]Wikipedia-scale Q&A[235]Wikipedia-scale Q&A
[235]Wikipedia-scale Q&A
 
[244]로봇이 현실 세계에 대해 학습하도록 만들기
[244]로봇이 현실 세계에 대해 학습하도록 만들기[244]로봇이 현실 세계에 대해 학습하도록 만들기
[244]로봇이 현실 세계에 대해 학습하도록 만들기
 
[243] Deep Learning to help student’s Deep Learning
[243] Deep Learning to help student’s Deep Learning[243] Deep Learning to help student’s Deep Learning
[243] Deep Learning to help student’s Deep Learning
 
[234]Fast & Accurate Data Annotation Pipeline for AI applications
[234]Fast & Accurate Data Annotation Pipeline for AI applications[234]Fast & Accurate Data Annotation Pipeline for AI applications
[234]Fast & Accurate Data Annotation Pipeline for AI applications
 
Old version: [233]대형 컨테이너 클러스터에서의 고가용성 Network Load Balancing
Old version: [233]대형 컨테이너 클러스터에서의 고가용성 Network Load BalancingOld version: [233]대형 컨테이너 클러스터에서의 고가용성 Network Load Balancing
Old version: [233]대형 컨테이너 클러스터에서의 고가용성 Network Load Balancing
 
[226]NAVER 광고 deep click prediction: 모델링부터 서빙까지
[226]NAVER 광고 deep click prediction: 모델링부터 서빙까지[226]NAVER 광고 deep click prediction: 모델링부터 서빙까지
[226]NAVER 광고 deep click prediction: 모델링부터 서빙까지
 
[225]NSML: 머신러닝 플랫폼 서비스하기 & 모델 튜닝 자동화하기
[225]NSML: 머신러닝 플랫폼 서비스하기 & 모델 튜닝 자동화하기[225]NSML: 머신러닝 플랫폼 서비스하기 & 모델 튜닝 자동화하기
[225]NSML: 머신러닝 플랫폼 서비스하기 & 모델 튜닝 자동화하기
 
[224]네이버 검색과 개인화
[224]네이버 검색과 개인화[224]네이버 검색과 개인화
[224]네이버 검색과 개인화
 
[216]Search Reliability Engineering (부제: 지진에도 흔들리지 않는 네이버 검색시스템)
[216]Search Reliability Engineering (부제: 지진에도 흔들리지 않는 네이버 검색시스템)[216]Search Reliability Engineering (부제: 지진에도 흔들리지 않는 네이버 검색시스템)
[216]Search Reliability Engineering (부제: 지진에도 흔들리지 않는 네이버 검색시스템)
 
[214] Ai Serving Platform: 하루 수 억 건의 인퍼런스를 처리하기 위한 고군분투기
[214] Ai Serving Platform: 하루 수 억 건의 인퍼런스를 처리하기 위한 고군분투기[214] Ai Serving Platform: 하루 수 억 건의 인퍼런스를 처리하기 위한 고군분투기
[214] Ai Serving Platform: 하루 수 억 건의 인퍼런스를 처리하기 위한 고군분투기
 
[213] Fashion Visual Search
[213] Fashion Visual Search[213] Fashion Visual Search
[213] Fashion Visual Search
 
[242]컴퓨터 비전을 이용한 실내 지도 자동 업데이트 방법: 딥러닝을 통한 POI 변화 탐지
[242]컴퓨터 비전을 이용한 실내 지도 자동 업데이트 방법: 딥러닝을 통한 POI 변화 탐지[242]컴퓨터 비전을 이용한 실내 지도 자동 업데이트 방법: 딥러닝을 통한 POI 변화 탐지
[242]컴퓨터 비전을 이용한 실내 지도 자동 업데이트 방법: 딥러닝을 통한 POI 변화 탐지
 
[212]C3, 데이터 처리에서 서빙까지 가능한 하둡 클러스터
[212]C3, 데이터 처리에서 서빙까지 가능한 하둡 클러스터[212]C3, 데이터 처리에서 서빙까지 가능한 하둡 클러스터
[212]C3, 데이터 처리에서 서빙까지 가능한 하둡 클러스터
 
[223]기계독해 QA: 검색인가, NLP인가?
[223]기계독해 QA: 검색인가, NLP인가?[223]기계독해 QA: 검색인가, NLP인가?
[223]기계독해 QA: 검색인가, NLP인가?
 
[231] Clova 화자인식
[231] Clova 화자인식[231] Clova 화자인식
[231] Clova 화자인식
 

Dernier

What's New in Teams Calling, Meetings and Devices March 2024
What's New in Teams Calling, Meetings and Devices March 2024What's New in Teams Calling, Meetings and Devices March 2024
What's New in Teams Calling, Meetings and Devices March 2024Stephanie Beckett
 
Digital Identity is Under Attack: FIDO Paris Seminar.pptx
Digital Identity is Under Attack: FIDO Paris Seminar.pptxDigital Identity is Under Attack: FIDO Paris Seminar.pptx
Digital Identity is Under Attack: FIDO Paris Seminar.pptxLoriGlavin3
 
Merck Moving Beyond Passwords: FIDO Paris Seminar.pptx
Merck Moving Beyond Passwords: FIDO Paris Seminar.pptxMerck Moving Beyond Passwords: FIDO Paris Seminar.pptx
Merck Moving Beyond Passwords: FIDO Paris Seminar.pptxLoriGlavin3
 
Transcript: New from BookNet Canada for 2024: BNC CataList - Tech Forum 2024
Transcript: New from BookNet Canada for 2024: BNC CataList - Tech Forum 2024Transcript: New from BookNet Canada for 2024: BNC CataList - Tech Forum 2024
Transcript: New from BookNet Canada for 2024: BNC CataList - Tech Forum 2024BookNet Canada
 
The State of Passkeys with FIDO Alliance.pptx
The State of Passkeys with FIDO Alliance.pptxThe State of Passkeys with FIDO Alliance.pptx
The State of Passkeys with FIDO Alliance.pptxLoriGlavin3
 
Commit 2024 - Secret Management made easy
Commit 2024 - Secret Management made easyCommit 2024 - Secret Management made easy
Commit 2024 - Secret Management made easyAlfredo García Lavilla
 
DSPy a system for AI to Write Prompts and Do Fine Tuning
DSPy a system for AI to Write Prompts and Do Fine TuningDSPy a system for AI to Write Prompts and Do Fine Tuning
DSPy a system for AI to Write Prompts and Do Fine TuningLars Bell
 
New from BookNet Canada for 2024: Loan Stars - Tech Forum 2024
New from BookNet Canada for 2024: Loan Stars - Tech Forum 2024New from BookNet Canada for 2024: Loan Stars - Tech Forum 2024
New from BookNet Canada for 2024: Loan Stars - Tech Forum 2024BookNet Canada
 
The Role of FIDO in a Cyber Secure Netherlands: FIDO Paris Seminar.pptx
The Role of FIDO in a Cyber Secure Netherlands: FIDO Paris Seminar.pptxThe Role of FIDO in a Cyber Secure Netherlands: FIDO Paris Seminar.pptx
The Role of FIDO in a Cyber Secure Netherlands: FIDO Paris Seminar.pptxLoriGlavin3
 
Take control of your SAP testing with UiPath Test Suite
Take control of your SAP testing with UiPath Test SuiteTake control of your SAP testing with UiPath Test Suite
Take control of your SAP testing with UiPath Test SuiteDianaGray10
 
"Subclassing and Composition – A Pythonic Tour of Trade-Offs", Hynek Schlawack
"Subclassing and Composition – A Pythonic Tour of Trade-Offs", Hynek Schlawack"Subclassing and Composition – A Pythonic Tour of Trade-Offs", Hynek Schlawack
"Subclassing and Composition – A Pythonic Tour of Trade-Offs", Hynek SchlawackFwdays
 
How AI, OpenAI, and ChatGPT impact business and software.
How AI, OpenAI, and ChatGPT impact business and software.How AI, OpenAI, and ChatGPT impact business and software.
How AI, OpenAI, and ChatGPT impact business and software.Curtis Poe
 
DevoxxFR 2024 Reproducible Builds with Apache Maven
DevoxxFR 2024 Reproducible Builds with Apache MavenDevoxxFR 2024 Reproducible Builds with Apache Maven
DevoxxFR 2024 Reproducible Builds with Apache MavenHervé Boutemy
 
SALESFORCE EDUCATION CLOUD | FEXLE SERVICES
SALESFORCE EDUCATION CLOUD | FEXLE SERVICESSALESFORCE EDUCATION CLOUD | FEXLE SERVICES
SALESFORCE EDUCATION CLOUD | FEXLE SERVICESmohitsingh558521
 
TeamStation AI System Report LATAM IT Salaries 2024
TeamStation AI System Report LATAM IT Salaries 2024TeamStation AI System Report LATAM IT Salaries 2024
TeamStation AI System Report LATAM IT Salaries 2024Lonnie McRorey
 
Scanning the Internet for External Cloud Exposures via SSL Certs
Scanning the Internet for External Cloud Exposures via SSL CertsScanning the Internet for External Cloud Exposures via SSL Certs
Scanning the Internet for External Cloud Exposures via SSL CertsRizwan Syed
 
Hyperautomation and AI/ML: A Strategy for Digital Transformation Success.pdf
Hyperautomation and AI/ML: A Strategy for Digital Transformation Success.pdfHyperautomation and AI/ML: A Strategy for Digital Transformation Success.pdf
Hyperautomation and AI/ML: A Strategy for Digital Transformation Success.pdfPrecisely
 
WordPress Websites for Engineers: Elevate Your Brand
WordPress Websites for Engineers: Elevate Your BrandWordPress Websites for Engineers: Elevate Your Brand
WordPress Websites for Engineers: Elevate Your Brandgvaughan
 
What is DBT - The Ultimate Data Build Tool.pdf
What is DBT - The Ultimate Data Build Tool.pdfWhat is DBT - The Ultimate Data Build Tool.pdf
What is DBT - The Ultimate Data Build Tool.pdfMounikaPolabathina
 
How to write a Business Continuity Plan
How to write a Business Continuity PlanHow to write a Business Continuity Plan
How to write a Business Continuity PlanDatabarracks
 

Dernier (20)

What's New in Teams Calling, Meetings and Devices March 2024
What's New in Teams Calling, Meetings and Devices March 2024What's New in Teams Calling, Meetings and Devices March 2024
What's New in Teams Calling, Meetings and Devices March 2024
 
Digital Identity is Under Attack: FIDO Paris Seminar.pptx
Digital Identity is Under Attack: FIDO Paris Seminar.pptxDigital Identity is Under Attack: FIDO Paris Seminar.pptx
Digital Identity is Under Attack: FIDO Paris Seminar.pptx
 
Merck Moving Beyond Passwords: FIDO Paris Seminar.pptx
Merck Moving Beyond Passwords: FIDO Paris Seminar.pptxMerck Moving Beyond Passwords: FIDO Paris Seminar.pptx
Merck Moving Beyond Passwords: FIDO Paris Seminar.pptx
 
Transcript: New from BookNet Canada for 2024: BNC CataList - Tech Forum 2024
Transcript: New from BookNet Canada for 2024: BNC CataList - Tech Forum 2024Transcript: New from BookNet Canada for 2024: BNC CataList - Tech Forum 2024
Transcript: New from BookNet Canada for 2024: BNC CataList - Tech Forum 2024
 
The State of Passkeys with FIDO Alliance.pptx
The State of Passkeys with FIDO Alliance.pptxThe State of Passkeys with FIDO Alliance.pptx
The State of Passkeys with FIDO Alliance.pptx
 
Commit 2024 - Secret Management made easy
Commit 2024 - Secret Management made easyCommit 2024 - Secret Management made easy
Commit 2024 - Secret Management made easy
 
DSPy a system for AI to Write Prompts and Do Fine Tuning
DSPy a system for AI to Write Prompts and Do Fine TuningDSPy a system for AI to Write Prompts and Do Fine Tuning
DSPy a system for AI to Write Prompts and Do Fine Tuning
 
New from BookNet Canada for 2024: Loan Stars - Tech Forum 2024
New from BookNet Canada for 2024: Loan Stars - Tech Forum 2024New from BookNet Canada for 2024: Loan Stars - Tech Forum 2024
New from BookNet Canada for 2024: Loan Stars - Tech Forum 2024
 
The Role of FIDO in a Cyber Secure Netherlands: FIDO Paris Seminar.pptx
The Role of FIDO in a Cyber Secure Netherlands: FIDO Paris Seminar.pptxThe Role of FIDO in a Cyber Secure Netherlands: FIDO Paris Seminar.pptx
The Role of FIDO in a Cyber Secure Netherlands: FIDO Paris Seminar.pptx
 
Take control of your SAP testing with UiPath Test Suite
Take control of your SAP testing with UiPath Test SuiteTake control of your SAP testing with UiPath Test Suite
Take control of your SAP testing with UiPath Test Suite
 
"Subclassing and Composition – A Pythonic Tour of Trade-Offs", Hynek Schlawack
"Subclassing and Composition – A Pythonic Tour of Trade-Offs", Hynek Schlawack"Subclassing and Composition – A Pythonic Tour of Trade-Offs", Hynek Schlawack
"Subclassing and Composition – A Pythonic Tour of Trade-Offs", Hynek Schlawack
 
How AI, OpenAI, and ChatGPT impact business and software.
How AI, OpenAI, and ChatGPT impact business and software.How AI, OpenAI, and ChatGPT impact business and software.
How AI, OpenAI, and ChatGPT impact business and software.
 
DevoxxFR 2024 Reproducible Builds with Apache Maven
DevoxxFR 2024 Reproducible Builds with Apache MavenDevoxxFR 2024 Reproducible Builds with Apache Maven
DevoxxFR 2024 Reproducible Builds with Apache Maven
 
SALESFORCE EDUCATION CLOUD | FEXLE SERVICES
SALESFORCE EDUCATION CLOUD | FEXLE SERVICESSALESFORCE EDUCATION CLOUD | FEXLE SERVICES
SALESFORCE EDUCATION CLOUD | FEXLE SERVICES
 
TeamStation AI System Report LATAM IT Salaries 2024
TeamStation AI System Report LATAM IT Salaries 2024TeamStation AI System Report LATAM IT Salaries 2024
TeamStation AI System Report LATAM IT Salaries 2024
 
Scanning the Internet for External Cloud Exposures via SSL Certs
Scanning the Internet for External Cloud Exposures via SSL CertsScanning the Internet for External Cloud Exposures via SSL Certs
Scanning the Internet for External Cloud Exposures via SSL Certs
 
Hyperautomation and AI/ML: A Strategy for Digital Transformation Success.pdf
Hyperautomation and AI/ML: A Strategy for Digital Transformation Success.pdfHyperautomation and AI/ML: A Strategy for Digital Transformation Success.pdf
Hyperautomation and AI/ML: A Strategy for Digital Transformation Success.pdf
 
WordPress Websites for Engineers: Elevate Your Brand
WordPress Websites for Engineers: Elevate Your BrandWordPress Websites for Engineers: Elevate Your Brand
WordPress Websites for Engineers: Elevate Your Brand
 
What is DBT - The Ultimate Data Build Tool.pdf
What is DBT - The Ultimate Data Build Tool.pdfWhat is DBT - The Ultimate Data Build Tool.pdf
What is DBT - The Ultimate Data Build Tool.pdf
 
How to write a Business Continuity Plan
How to write a Business Continuity PlanHow to write a Business Continuity Plan
How to write a Business Continuity Plan
 

[232] TensorRT를 활용한 딥러닝 Inference 최적화

  • 1.
  • 2.
  • 3.
  • 4.
  • 5.
  • 6.
  • 7.
  • 8.
  • 9.
  • 10.
  • 11.
  • 12.
  • 13.
  • 14. Step 1: TF모델을 TRT 포맷으로 변환 Step 2: 모델 Parser 생성 Step 3: 입/출력 레이어 정보 입력 Step 4: 모델의 최적화 및 런타임 Engine 생성 Step 5: 엔진을 파일로 저장 Step 6: 엔진을 파일에서 읽음 Step 7: Inference 수행
  • 15.
  • 16.
  • 17.
  • 18.
  • 19.
  • 20.
  • 21.
  • 22.
  • 24. • • • PReLUPlugin::PReLUPlugin(const Weights *weights, int nbWeights) { mWeights = weights[0]; mWeights.values = malloc(mWeights.count * type2size(mWeights.type)); memcpy(const_cast<void *>(mWeights.values), weights[0].values, mWeights.count * type2size(mWeights.type)); }
  • 25. int PReLUPlugin::enqueue(int batchSize, const void *const *inputs, void **outputs, void *workspace, cudaStream_t stream) { const float zerof{0.0f}; const __half zeroh = fp16::__float2half(0.0f); if (mWeights.type == DataType::__float) { CHECK(Forward_gpu<__float>(batchSize * mNbInputCount, mNbInputChannels, mNbInputHeight * mNbInputHeight, reinterpret_cast<const __float *>(mDeviceKernel), reinterpret_cast<const __float *>(inputs[0]), reinterpret_cast<__float *>(outputs[0]), zerof, mChannelShared ? mNbInputChannels : 1, stream)); } else { // DataType::kFLOAT } return 0; }
  • 26. template <typename Ftype> __global__ void PReLUForward(const int n, const int channels, const int dim, const Ftype* slope_data, const Ftype* in, Ftype* out, const Ftype zero, const int div_factor) { CUDA_KERNEL_LOOP(index, n) { int c = (index / dim) % channels / div_factor; out[index] = (in[index] > (Ftype(zero))) ? in[index] : in[index] * *(reinterpret_cast<const Ftype*>(slope_data)+c); } }
  • 27. template <typename Ftype> cudaError_t Forward_gpu(const int count, const int channels, const int dim, const Ftype* mDeviceKernel, const Ftype* bottom_data, Ftype* top_data, const Ftype zero, const int div_factor, const cudaStream_t stream) { PReLUForward<<<CAFFE_GET_BLOCKS(count), CAFFE_CUDA_NUM_THREADS, 0, stream>>> (count, channels, dim, mDeviceKernel, bottom_data, top_data, zero, div_factor); return cudaGetLastError(); }
  • 28.
  • 29. IPluginExt *PReLUPlugin::clone() const override { return new PReLUPlugin(&mWeights, 1); } IPlugin* pluginFactory::createPlugin(const char* layerName, const Weights* serialData, int nbWeights) override { return new PReLUPlugin(serialData, serialLength); }
  • 30. PluginFactory parserPluginFactory; parser->setPluginFactoryExt(&parserPluginFactory); const IBlobNameToTensor *blobNameToTensor = parser->parse(gParams.deployFile.c_str(), // caffe deploy file gParams.modelFile.c_str(), // caffe model file *network, // network definition that the parser will populate gParams.fp16 ? DataType::kHALF : DataType::kFLOAT);
  • 32. void PReLUPlugin::serialize(void *buffer) { char *d = static_cast<char *>(buffer), *a = d; write(d, mNbInputChannels); write(d, mNbInputHeight); write(d, mNbInputWidth); write(d, mNbInputCount); write(d, mChannelShared); write(d, mWeights.count); write(d, mWeights.type); convertAndCopyToBuffer(d, mWeights); assert(d == a + getSerializationSize()); }
  • 33. PReLUPlugin::PReLUPlugin(const void *data, size_t length) { const char *d = static_cast<const char *>(data), *a = d; read<int>(d, mNbInputChannels); read<int>(d, mNbInputHeight); read<int>(d, mNbInputWidth); read<int>(d, mNbInputCount); read<bool>(d, mChannelShared); read<int64_t>(d, mWeights.count); read<DataType>(d, mWeights.type); mWeights.values = malloc(mWeights.count * type2size(mWeights.type)); memcpy(const_cast<void *>(mWeights.values), d, mWeights.count * type2size(mWeights.type)); deserializeToDevice(d, mDeviceKernel, mWeights.count * type2size(mWeights.type)); assert(d == a + length); }
  • 34. Iplugin *PluginFactory::createPlugin(const char *layerName, const void *serialData, size_t serialLength) override { return new PReLUPlugin(serialData, serialLength); }
  • 35. PluginFactory pluginFactory; engine = infer->deserializeCudaEngine(trt_plan_file, size, &pluginFactory);
  • 36. cudaMemcpyAsync(buffers[inputIndex], input, batchSize * INPUT_SIZE * sizeof(float), cudaMemcpyHostToDevice, stream); context->enqueue(gParams.batchSize, &buffers[0], stream, nullptr); cudaMemcpyAsync(output, buffers[outputIndex], batchSize * OUTPUT_SIZE * sizeof(float), cudaMemcpyDeviceToHost, stream); cudaStreamSynchronize(stream); cudaStreamCreate(&stream)); IExecutionContext* context = engine->createExecutionContext();