SlideShare a Scribd company logo
1 of 33
Download to read offline
> me
$name
[1] "Takashi Kitano"
$twitter
[1] "@kashitan"
$work_in
[1] " "
前回の神会




> head(talks_en)
# A tibble: 6 x 2
title_en transcript_en
<chr> <chr>
1 Fake videos of real people — and how to … Look at these images. Now…
2 How to build synthetic DNA and send it a… "Alright, let me tell you…
3 Technology that knows what you're feeling "What happens when techno…
4 How to get empowered, not overpowered, b… "After 13.8 billion years…
5 Where joy hides and how to find it "It's 2008, and I'm just …
6 ""You Found Me"" (Cello music starts) You …
> talks_en %>% tidytext::unnest_tokens(word, transcript_en)
> talks_en %>% tidytext::unnest_tokens(word, transcript_en)
# A tibble: 9,840 x 2
title_en word
<chr> <chr>
1 Fake videos of real people — and how to spot them look
2 Fake videos of real people — and how to spot them at
3 Fake videos of real people — and how to spot them these
4 Fake videos of real people — and how to spot them images
5 Fake videos of real people — and how to spot them now
6 Fake videos of real people — and how to spot them tell
7 Fake videos of real people — and how to spot them me
8 Fake videos of real people — and how to spot them which
9 Fake videos of real people — and how to spot them obama
10 Fake videos of real people — and how to spot them here
> talks_ja %>% head()
# A tibble: 6 x 2
title_ja transcript_ja
<chr> <chr>
1 … …
2 DNA … …
3 …
4 AI AI … 138 …
5 2008 …
6 You Found Me … …
> talks_ja %>% tidytext::unnest_tokens(word, transcript_ja)
# A tibble: 6,266,182 x 2
title_ja word
<chr> <chr>
1
2
3
4
5
6
# ... with 6,266,172 more rows
からの罠
> talks_ja %>% tidytext::unnest_tokens(word, transcript_ja)
# A tibble: 6,266,182 x 2
title_ja word
<chr> <chr>
1
2
3
4
5
6
# ... with 6,266,172 more rows
> mecab_result <- talks_ja %>%
+ RMeCabDF("transcript_ja", 1)
> glimpse(mecab_result)
List of 1
$ : Named chr [1:1445] " " " " " " " " ...
..- attr(*, "names")= chr [1:1445] " " " " " " " " ...
> mecab_result <- talks_ja %>%
+ as.data.frame() %>%
+ RMeCabDF("transcript_ja", 1)
> glimpse(mecab_result)
List of 2551
$ : Named chr [1:1445] " " " " " " " " ...
..- attr(*, "names")= chr [1:1445] " " " " " " " " ...
$ : Named chr [1:2903] " " " " " " " " ...
..- attr(*, "names")= chr [1:2903] " " " " " " " " ...
$ : Named chr [1:2208] " " " " " " " " ...
..- attr(*, "names")= chr [1:2208] " " " " " " " " ...
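> # Extracting a single column from a tibble with [ still returns a tibble
> class(talks_ja[, "transcript_ja"])
[1] "tbl_df" "tbl" "data.frame"
> # so its length is 1 (the number of columns)
> length(talks_ja[, "transcript_ja"])
[1] 1
> # Extracting a single column from a data.frame returns a character vector
> class(as.data.frame(talks_ja)[, "transcript_ja"])
[1] "character"
> # so its length is the number of rows
> length(as.data.frame(talks_ja)[, "transcript_ja"])
[1] 2551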
> tokens_ja <- purrr::pmap_df(list(nv = mecab_result,
+ title = talks_ja$title_ja),
+ function(nv, title){
+ tibble(title = title,
+ word = nv,
+ hinshi = names(nv))
+ })
> tokens_ja
# A tibble: 6,483,469 x 3
title word hinshi
<chr> <chr> <chr>
1
2
3
4
5
6
# ... with 6,483,463 more rows
> bigram_en <- talks_en %>% select(title_en, transcript_en) %>%
+ tidytext::unnest_tokens(bigram, transcript_en, token = "ngrams",
+ n = 2)
> head(bigram_en)
# A tibble: 6 x 2
title_en bigram
<chr> <chr>
1 ""(Nothing But) Flowers" with string quartet" music here
2 ""(Nothing But) Flowers" with string quartet" here we
3 ""(Nothing But) Flowers" with string quartet" we stand
4 ""(Nothing But) Flowers" with string quartet" stand like
5 ""(Nothing But) Flowers" with string quartet" like an
6 ""(Nothing But) Flowers" with string quartet" an adam
> bigram_ja <- talks_ja %>%
+ as.data.frame() %>%
+ docDF(col = "transcript_ja", type=1, N = 2)
number of extracted terms = 898167
now making a data frame. wait a while!
> bigram_ja.bk %>%
+ select(TERM, POS1, Row1, Row2, Row3, Row4, Row5, Row6, Row7,
+ Row8, Row9) %>%
+ head()
TERM POS1 Row1 Row2 Row3 Row4 Row5 Row6 Row7 Row8 Row9
1 !-( - 0 0 0 0 0 0 0 0 0
2 !-7 - 0 0 0 0 0 0 0 0 0
3 !-Google - 0 0 0 0 0 0 0 0 0
4 !-Little - 0 0 0 0 0 0 0 0 0
5 !-Time - 0 0 0 0 0 0 0 0 0
6 !-Toonchi - 0 0 0 0 0 0 0 0 0
> bigram_ja <- tokens_ja %>%
+ group_by(title) %>%
+ rename(word1 = word,
+ hinshi1 = hinshi) %>%
+ mutate(word2 = lead(word1),
+ hinshi2 = lead(hinshi1)) %>%
+ ungroup() %>%
+ filter(!is.na(word2)) %>%
+ select(title, word1, word2, hinshi1, hinshi2)
> bigram_ja
# A tibble: 6,480,920 x 5
title word1 word2 hinshi1 hinshi2
<chr> <chr> <chr> <chr> <chr>
1
2
3
4
5
6
7
8
# ... with 6,480,912 more rows




{tidytext}と{RMeCab}によるモダンな日本語テキスト分析

More Related Content

What's hot

モデル予見制御に基づくペアトレード戦略
モデル予見制御に基づくペアトレード戦略モデル予見制御に基づくペアトレード戦略
モデル予見制御に基づくペアトレード戦略Kei Nakagawa
 
負の二項分布について
負の二項分布について負の二項分布について
負の二項分布についてHiroshi Shimizu
 
{tidygraph}と{ggraph}によるモダンなネットワーク分析
{tidygraph}と{ggraph}によるモダンなネットワーク分析{tidygraph}と{ggraph}によるモダンなネットワーク分析
{tidygraph}と{ggraph}によるモダンなネットワーク分析Takashi Kitano
 
Rustに触れて私のPythonはどう変わったか
Rustに触れて私のPythonはどう変わったかRustに触れて私のPythonはどう変わったか
Rustに触れて私のPythonはどう変わったかShunsukeNakamura17
 
機械学習を使った時系列売上予測
機械学習を使った時系列売上予測機械学習を使った時系列売上予測
機械学習を使った時系列売上予測DataRobotJP
 
「R言語による Random Forest 徹底入門 -集団学習による分類・予測-」 - #TokyoR #11
「R言語による Random Forest 徹底入門 -集団学習による分類・予測-」 - #TokyoR  #11「R言語による Random Forest 徹底入門 -集団学習による分類・予測-」 - #TokyoR  #11
「R言語による Random Forest 徹底入門 -集団学習による分類・予測-」 - #TokyoR #11Koichi Hamada
 
Prophet入門【理論編】Facebookの時系列予測ツール
Prophet入門【理論編】Facebookの時系列予測ツールProphet入門【理論編】Facebookの時系列予測ツール
Prophet入門【理論編】Facebookの時系列予測ツールhoxo_m
 
Statistical Semantic入門 ~分布仮説からword2vecまで~
Statistical Semantic入門 ~分布仮説からword2vecまで~Statistical Semantic入門 ~分布仮説からword2vecまで~
Statistical Semantic入門 ~分布仮説からword2vecまで~Yuya Unno
 
関数データ解析の概要とその方法
関数データ解析の概要とその方法関数データ解析の概要とその方法
関数データ解析の概要とその方法Hidetoshi Matsui
 
2 6.ゼロ切断・過剰モデル
2 6.ゼロ切断・過剰モデル2 6.ゼロ切断・過剰モデル
2 6.ゼロ切断・過剰モデルlogics-of-blue
 
はじめてのShiny
はじめてのShinyはじめてのShiny
はじめてのShinyKazuya Wada
 
最適化超入門
最適化超入門最適化超入門
最適化超入門Takami Sato
 
統計的因果推論への招待 -因果構造探索を中心に-
統計的因果推論への招待 -因果構造探索を中心に-統計的因果推論への招待 -因果構造探索を中心に-
統計的因果推論への招待 -因果構造探索を中心に-Shiga University, RIKEN
 
社会心理学者のための時系列分析入門_小森
社会心理学者のための時系列分析入門_小森社会心理学者のための時系列分析入門_小森
社会心理学者のための時系列分析入門_小森Masashi Komori
 
3分でわかる多項分布とディリクレ分布
3分でわかる多項分布とディリクレ分布3分でわかる多項分布とディリクレ分布
3分でわかる多項分布とディリクレ分布Junya Saito
 
ゼロから始める自然言語処理 【FIT2016チュートリアル】
ゼロから始める自然言語処理 【FIT2016チュートリアル】ゼロから始める自然言語処理 【FIT2016チュートリアル】
ゼロから始める自然言語処理 【FIT2016チュートリアル】Yuki Arase
 

What's hot (20)

モデル予見制御に基づくペアトレード戦略
モデル予見制御に基づくペアトレード戦略モデル予見制御に基づくペアトレード戦略
モデル予見制御に基づくペアトレード戦略
 
負の二項分布について
負の二項分布について負の二項分布について
負の二項分布について
 
{tidygraph}と{ggraph}によるモダンなネットワーク分析
{tidygraph}と{ggraph}によるモダンなネットワーク分析{tidygraph}と{ggraph}によるモダンなネットワーク分析
{tidygraph}と{ggraph}によるモダンなネットワーク分析
 
Stan超初心者入門
Stan超初心者入門Stan超初心者入門
Stan超初心者入門
 
R seminar on igraph
R seminar on igraphR seminar on igraph
R seminar on igraph
 
Rustに触れて私のPythonはどう変わったか
Rustに触れて私のPythonはどう変わったかRustに触れて私のPythonはどう変わったか
Rustに触れて私のPythonはどう変わったか
 
機械学習を使った時系列売上予測
機械学習を使った時系列売上予測機械学習を使った時系列売上予測
機械学習を使った時系列売上予測
 
一般化線形モデル (GLM) & 一般化加法モデル(GAM)
一般化線形モデル (GLM) & 一般化加法モデル(GAM) 一般化線形モデル (GLM) & 一般化加法モデル(GAM)
一般化線形モデル (GLM) & 一般化加法モデル(GAM)
 
「R言語による Random Forest 徹底入門 -集団学習による分類・予測-」 - #TokyoR #11
「R言語による Random Forest 徹底入門 -集団学習による分類・予測-」 - #TokyoR  #11「R言語による Random Forest 徹底入門 -集団学習による分類・予測-」 - #TokyoR  #11
「R言語による Random Forest 徹底入門 -集団学習による分類・予測-」 - #TokyoR #11
 
Prophet入門【理論編】Facebookの時系列予測ツール
Prophet入門【理論編】Facebookの時系列予測ツールProphet入門【理論編】Facebookの時系列予測ツール
Prophet入門【理論編】Facebookの時系列予測ツール
 
Statistical Semantic入門 ~分布仮説からword2vecまで~
Statistical Semantic入門 ~分布仮説からword2vecまで~Statistical Semantic入門 ~分布仮説からword2vecまで~
Statistical Semantic入門 ~分布仮説からword2vecまで~
 
関数データ解析の概要とその方法
関数データ解析の概要とその方法関数データ解析の概要とその方法
関数データ解析の概要とその方法
 
2 6.ゼロ切断・過剰モデル
2 6.ゼロ切断・過剰モデル2 6.ゼロ切断・過剰モデル
2 6.ゼロ切断・過剰モデル
 
はじめてのShiny
はじめてのShinyはじめてのShiny
はじめてのShiny
 
Rで学ぶロバスト推定
Rで学ぶロバスト推定Rで学ぶロバスト推定
Rで学ぶロバスト推定
 
最適化超入門
最適化超入門最適化超入門
最適化超入門
 
統計的因果推論への招待 -因果構造探索を中心に-
統計的因果推論への招待 -因果構造探索を中心に-統計的因果推論への招待 -因果構造探索を中心に-
統計的因果推論への招待 -因果構造探索を中心に-
 
社会心理学者のための時系列分析入門_小森
社会心理学者のための時系列分析入門_小森社会心理学者のための時系列分析入門_小森
社会心理学者のための時系列分析入門_小森
 
3分でわかる多項分布とディリクレ分布
3分でわかる多項分布とディリクレ分布3分でわかる多項分布とディリクレ分布
3分でわかる多項分布とディリクレ分布
 
ゼロから始める自然言語処理 【FIT2016チュートリアル】
ゼロから始める自然言語処理 【FIT2016チュートリアル】ゼロから始める自然言語処理 【FIT2016チュートリアル】
ゼロから始める自然言語処理 【FIT2016チュートリアル】
 

Similar to {tidytext}と{RMeCab}によるモダンな日本語テキスト分析

PLOTCON NYC: Behind Every Great Plot There's a Great Deal of Wrangling
PLOTCON NYC: Behind Every Great Plot There's a Great Deal of WranglingPLOTCON NYC: Behind Every Great Plot There's a Great Deal of Wrangling
PLOTCON NYC: Behind Every Great Plot There's a Great Deal of WranglingPlotly
 
pa-pe-pi-po-pure Python Text Processing
pa-pe-pi-po-pure Python Text Processingpa-pe-pi-po-pure Python Text Processing
pa-pe-pi-po-pure Python Text ProcessingRodrigo Senra
 
令和から本気出す
令和から本気出す令和から本気出す
令和から本気出すTakashi Kitano
 
Pre-Bootcamp introduction to Elixir
Pre-Bootcamp introduction to ElixirPre-Bootcamp introduction to Elixir
Pre-Bootcamp introduction to ElixirPaweł Dawczak
 
Learn 90% of Python in 90 Minutes
Learn 90% of Python in 90 MinutesLearn 90% of Python in 90 Minutes
Learn 90% of Python in 90 MinutesMatt Harrison
 
Τα Πολύ Βασικά για την Python
Τα Πολύ Βασικά για την PythonΤα Πολύ Βασικά για την Python
Τα Πολύ Βασικά για την PythonMoses Boudourides
 
Derrubando mitos em Python
Derrubando mitos em PythonDerrubando mitos em Python
Derrubando mitos em PythonDenis Costa
 
Beautiful python - PyLadies
Beautiful python - PyLadiesBeautiful python - PyLadies
Beautiful python - PyLadiesAlicia Pérez
 
Palestra sobre Collections com Python
Palestra sobre Collections com PythonPalestra sobre Collections com Python
Palestra sobre Collections com Pythonpugpe
 
M12 random forest-part01
M12 random forest-part01M12 random forest-part01
M12 random forest-part01Raman Kannan
 
R is a very flexible and powerful programming language, as well as a.pdf
R is a very flexible and powerful programming language, as well as a.pdfR is a very flexible and powerful programming language, as well as a.pdf
R is a very flexible and powerful programming language, as well as a.pdfannikasarees
 
Text mining and social network analysis of twitter data part 1
Text mining and social network analysis of twitter data part 1Text mining and social network analysis of twitter data part 1
Text mining and social network analysis of twitter data part 1Johan Blomme
 
Helvetia
HelvetiaHelvetia
HelvetiaESUG
 
Easy HTML Tables in RStudio with Tabyl and kableExtra
Easy HTML Tables in RStudio with Tabyl and kableExtraEasy HTML Tables in RStudio with Tabyl and kableExtra
Easy HTML Tables in RStudio with Tabyl and kableExtraBarry DeCicco
 
RではじめるTwitter解析
RではじめるTwitter解析RではじめるTwitter解析
RではじめるTwitter解析Takeshi Arabiki
 

Similar to {tidytext}と{RMeCab}によるモダンな日本語テキスト分析 (20)

PLOTCON NYC: Behind Every Great Plot There's a Great Deal of Wrangling
PLOTCON NYC: Behind Every Great Plot There's a Great Deal of WranglingPLOTCON NYC: Behind Every Great Plot There's a Great Deal of Wrangling
PLOTCON NYC: Behind Every Great Plot There's a Great Deal of Wrangling
 
pa-pe-pi-po-pure Python Text Processing
pa-pe-pi-po-pure Python Text Processingpa-pe-pi-po-pure Python Text Processing
pa-pe-pi-po-pure Python Text Processing
 
令和から本気出す
令和から本気出す令和から本気出す
令和から本気出す
 
Pre-Bootcamp introduction to Elixir
Pre-Bootcamp introduction to ElixirPre-Bootcamp introduction to Elixir
Pre-Bootcamp introduction to Elixir
 
Learn 90% of Python in 90 Minutes
Learn 90% of Python in 90 MinutesLearn 90% of Python in 90 Minutes
Learn 90% of Python in 90 Minutes
 
Τα Πολύ Βασικά για την Python
Τα Πολύ Βασικά για την PythonΤα Πολύ Βασικά για την Python
Τα Πολύ Βασικά για την Python
 
Derrubando mitos em Python
Derrubando mitos em PythonDerrubando mitos em Python
Derrubando mitos em Python
 
Elixir
ElixirElixir
Elixir
 
Beautiful python - PyLadies
Beautiful python - PyLadiesBeautiful python - PyLadies
Beautiful python - PyLadies
 
Palestra sobre Collections com Python
Palestra sobre Collections com PythonPalestra sobre Collections com Python
Palestra sobre Collections com Python
 
R programming language
R programming languageR programming language
R programming language
 
M12 random forest-part01
M12 random forest-part01M12 random forest-part01
M12 random forest-part01
 
R is a very flexible and powerful programming language, as well as a.pdf
R is a very flexible and powerful programming language, as well as a.pdfR is a very flexible and powerful programming language, as well as a.pdf
R is a very flexible and powerful programming language, as well as a.pdf
 
Text mining and social network analysis of twitter data part 1
Text mining and social network analysis of twitter data part 1Text mining and social network analysis of twitter data part 1
Text mining and social network analysis of twitter data part 1
 
Python 1
Python 1Python 1
Python 1
 
Py ohio
Py ohioPy ohio
Py ohio
 
Helvetia
HelvetiaHelvetia
Helvetia
 
Easy HTML Tables in RStudio with Tabyl and kableExtra
Easy HTML Tables in RStudio with Tabyl and kableExtraEasy HTML Tables in RStudio with Tabyl and kableExtra
Easy HTML Tables in RStudio with Tabyl and kableExtra
 
RではじめるTwitter解析
RではじめるTwitter解析RではじめるTwitter解析
RではじめるTwitter解析
 
Delete statement in PHP
Delete statement in PHPDelete statement in PHP
Delete statement in PHP
 

More from Takashi Kitano

好みの日本酒を呑みたい! 〜さけのわデータで探す自分好みの酒〜
好みの日本酒を呑みたい! 〜さけのわデータで探す自分好みの酒〜好みの日本酒を呑みたい! 〜さけのわデータで探す自分好みの酒〜
好みの日本酒を呑みたい! 〜さけのわデータで探す自分好みの酒〜Takashi Kitano
 
{shiny}と{leaflet}による地図アプリ開発Tips
{shiny}と{leaflet}による地図アプリ開発Tips{shiny}と{leaflet}による地図アプリ開発Tips
{shiny}と{leaflet}による地図アプリ開発TipsTakashi Kitano
 
20170923 excelユーザーのためのr入門
20170923 excelユーザーのためのr入門20170923 excelユーザーのためのr入門
20170923 excelユーザーのためのr入門Takashi Kitano
 
mxnetで頑張る深層学習
mxnetで頑張る深層学習mxnetで頑張る深層学習
mxnetで頑張る深層学習Takashi Kitano
 
可視化周辺の進化がヤヴァイ 〜2016〜
可視化周辺の進化がヤヴァイ 〜2016〜可視化周辺の進化がヤヴァイ 〜2016〜
可視化周辺の進化がヤヴァイ 〜2016〜Takashi Kitano
 
Rによるウイスキー分析
Rによるウイスキー分析Rによるウイスキー分析
Rによるウイスキー分析Takashi Kitano
 
20160311 基礎からのベイズ統計学輪読会第6章 公開ver
20160311 基礎からのベイズ統計学輪読会第6章 公開ver20160311 基礎からのベイズ統計学輪読会第6章 公開ver
20160311 基礎からのベイズ統計学輪読会第6章 公開verTakashi Kitano
 
20140625 rでのデータ分析(仮) for_tokyor
20140625 rでのデータ分析(仮) for_tokyor20140625 rでのデータ分析(仮) for_tokyor
20140625 rでのデータ分析(仮) for_tokyorTakashi Kitano
 
lubridateパッケージ入門
lubridateパッケージ入門lubridateパッケージ入門
lubridateパッケージ入門Takashi Kitano
 
Google's r style guideのすゝめ
Google's r style guideのすゝめGoogle's r style guideのすゝめ
Google's r style guideのすゝめTakashi Kitano
 

More from Takashi Kitano (12)

好みの日本酒を呑みたい! 〜さけのわデータで探す自分好みの酒〜
好みの日本酒を呑みたい! 〜さけのわデータで探す自分好みの酒〜好みの日本酒を呑みたい! 〜さけのわデータで探す自分好みの酒〜
好みの日本酒を呑みたい! 〜さけのわデータで探す自分好みの酒〜
 
{shiny}と{leaflet}による地図アプリ開発Tips
{shiny}と{leaflet}による地図アプリ開発Tips{shiny}と{leaflet}による地図アプリ開発Tips
{shiny}と{leaflet}による地図アプリ開発Tips
 
20170923 excelユーザーのためのr入門
20170923 excelユーザーのためのr入門20170923 excelユーザーのためのr入門
20170923 excelユーザーのためのr入門
 
mxnetで頑張る深層学習
mxnetで頑張る深層学習mxnetで頑張る深層学習
mxnetで頑張る深層学習
 
可視化周辺の進化がヤヴァイ 〜2016〜
可視化周辺の進化がヤヴァイ 〜2016〜可視化周辺の進化がヤヴァイ 〜2016〜
可視化周辺の進化がヤヴァイ 〜2016〜
 
Rによるウイスキー分析
Rによるウイスキー分析Rによるウイスキー分析
Rによるウイスキー分析
 
20160311 基礎からのベイズ統計学輪読会第6章 公開ver
20160311 基礎からのベイズ統計学輪読会第6章 公開ver20160311 基礎からのベイズ統計学輪読会第6章 公開ver
20160311 基礎からのベイズ統計学輪読会第6章 公開ver
 
20140625 rでのデータ分析(仮) for_tokyor
20140625 rでのデータ分析(仮) for_tokyor20140625 rでのデータ分析(仮) for_tokyor
20140625 rでのデータ分析(仮) for_tokyor
 
lubridateパッケージ入門
lubridateパッケージ入門lubridateパッケージ入門
lubridateパッケージ入門
 
20150329 tokyo r47
20150329 tokyo r4720150329 tokyo r47
20150329 tokyo r47
 
20140920 tokyo r43
20140920 tokyo r4320140920 tokyo r43
20140920 tokyo r43
 
Google's r style guideのすゝめ
Google's r style guideのすゝめGoogle's r style guideのすゝめ
Google's r style guideのすゝめ
 

Recently uploaded

Unveiling Insights: The Role of a Data Analyst
Unveiling Insights: The Role of a Data AnalystUnveiling Insights: The Role of a Data Analyst
Unveiling Insights: The Role of a Data AnalystSamantha Rae Coolbeth
 
VidaXL dropshipping via API with DroFx.pptx
VidaXL dropshipping via API with DroFx.pptxVidaXL dropshipping via API with DroFx.pptx
VidaXL dropshipping via API with DroFx.pptxolyaivanovalion
 
Al Barsha Escorts $#$ O565212860 $#$ Escort Service In Al Barsha
Al Barsha Escorts $#$ O565212860 $#$ Escort Service In Al BarshaAl Barsha Escorts $#$ O565212860 $#$ Escort Service In Al Barsha
Al Barsha Escorts $#$ O565212860 $#$ Escort Service In Al BarshaAroojKhan71
 
RA-11058_IRR-COMPRESS Do 198 series of 1998
RA-11058_IRR-COMPRESS Do 198 series of 1998RA-11058_IRR-COMPRESS Do 198 series of 1998
RA-11058_IRR-COMPRESS Do 198 series of 1998YohFuh
 
Edukaciniai dropshipping via API with DroFx
Edukaciniai dropshipping via API with DroFxEdukaciniai dropshipping via API with DroFx
Edukaciniai dropshipping via API with DroFxolyaivanovalion
 
VIP High Class Call Girls Jamshedpur Anushka 8250192130 Independent Escort Se...
VIP High Class Call Girls Jamshedpur Anushka 8250192130 Independent Escort Se...VIP High Class Call Girls Jamshedpur Anushka 8250192130 Independent Escort Se...
VIP High Class Call Girls Jamshedpur Anushka 8250192130 Independent Escort Se...Suhani Kapoor
 
BPAC WITH UFSBI GENERAL PRESENTATION 18_05_2017-1.pptx
BPAC WITH UFSBI GENERAL PRESENTATION 18_05_2017-1.pptxBPAC WITH UFSBI GENERAL PRESENTATION 18_05_2017-1.pptx
BPAC WITH UFSBI GENERAL PRESENTATION 18_05_2017-1.pptxMohammedJunaid861692
 
Delhi Call Girls Punjabi Bagh 9711199171 ☎✔👌✔ Whatsapp Hard And Sexy Vip Call
Delhi Call Girls Punjabi Bagh 9711199171 ☎✔👌✔ Whatsapp Hard And Sexy Vip CallDelhi Call Girls Punjabi Bagh 9711199171 ☎✔👌✔ Whatsapp Hard And Sexy Vip Call
Delhi Call Girls Punjabi Bagh 9711199171 ☎✔👌✔ Whatsapp Hard And Sexy Vip Callshivangimorya083
 
Ravak dropshipping via API with DroFx.pptx
Ravak dropshipping via API with DroFx.pptxRavak dropshipping via API with DroFx.pptx
Ravak dropshipping via API with DroFx.pptxolyaivanovalion
 
Midocean dropshipping via API with DroFx
Midocean dropshipping via API with DroFxMidocean dropshipping via API with DroFx
Midocean dropshipping via API with DroFxolyaivanovalion
 
Call Girls in Sarai Kale Khan Delhi 💯 Call Us 🔝9205541914 🔝( Delhi) Escorts S...
Call Girls in Sarai Kale Khan Delhi 💯 Call Us 🔝9205541914 🔝( Delhi) Escorts S...Call Girls in Sarai Kale Khan Delhi 💯 Call Us 🔝9205541914 🔝( Delhi) Escorts S...
Call Girls in Sarai Kale Khan Delhi 💯 Call Us 🔝9205541914 🔝( Delhi) Escorts S...Delhi Call girls
 
BigBuy dropshipping via API with DroFx.pptx
BigBuy dropshipping via API with DroFx.pptxBigBuy dropshipping via API with DroFx.pptx
BigBuy dropshipping via API with DroFx.pptxolyaivanovalion
 
Delhi Call Girls CP 9711199171 ☎✔👌✔ Whatsapp Hard And Sexy Vip Call
Delhi Call Girls CP 9711199171 ☎✔👌✔ Whatsapp Hard And Sexy Vip CallDelhi Call Girls CP 9711199171 ☎✔👌✔ Whatsapp Hard And Sexy Vip Call
Delhi Call Girls CP 9711199171 ☎✔👌✔ Whatsapp Hard And Sexy Vip Callshivangimorya083
 
Smarteg dropshipping via API with DroFx.pptx
Smarteg dropshipping via API with DroFx.pptxSmarteg dropshipping via API with DroFx.pptx
Smarteg dropshipping via API with DroFx.pptxolyaivanovalion
 
Mature dropshipping via API with DroFx.pptx
Mature dropshipping via API with DroFx.pptxMature dropshipping via API with DroFx.pptx
Mature dropshipping via API with DroFx.pptxolyaivanovalion
 
Data-Analysis for Chicago Crime Data 2023
Data-Analysis for Chicago Crime Data  2023Data-Analysis for Chicago Crime Data  2023
Data-Analysis for Chicago Crime Data 2023ymrp368
 
Call me @ 9892124323 Cheap Rate Call Girls in Vashi with Real Photo 100% Secure
Call me @ 9892124323  Cheap Rate Call Girls in Vashi with Real Photo 100% SecureCall me @ 9892124323  Cheap Rate Call Girls in Vashi with Real Photo 100% Secure
Call me @ 9892124323 Cheap Rate Call Girls in Vashi with Real Photo 100% SecurePooja Nehwal
 
CebaBaby dropshipping via API with DroFX.pptx
CebaBaby dropshipping via API with DroFX.pptxCebaBaby dropshipping via API with DroFX.pptx
CebaBaby dropshipping via API with DroFX.pptxolyaivanovalion
 
Ukraine War presentation: KNOW THE BASICS
Ukraine War presentation: KNOW THE BASICSUkraine War presentation: KNOW THE BASICS
Ukraine War presentation: KNOW THE BASICSAishani27
 
04242024_CCC TUG_Joins and Relationships
04242024_CCC TUG_Joins and Relationships04242024_CCC TUG_Joins and Relationships
04242024_CCC TUG_Joins and Relationshipsccctableauusergroup
 

Recently uploaded (20)

Unveiling Insights: The Role of a Data Analyst
Unveiling Insights: The Role of a Data AnalystUnveiling Insights: The Role of a Data Analyst
Unveiling Insights: The Role of a Data Analyst
 
VidaXL dropshipping via API with DroFx.pptx
VidaXL dropshipping via API with DroFx.pptxVidaXL dropshipping via API with DroFx.pptx
VidaXL dropshipping via API with DroFx.pptx
 
Al Barsha Escorts $#$ O565212860 $#$ Escort Service In Al Barsha
Al Barsha Escorts $#$ O565212860 $#$ Escort Service In Al BarshaAl Barsha Escorts $#$ O565212860 $#$ Escort Service In Al Barsha
Al Barsha Escorts $#$ O565212860 $#$ Escort Service In Al Barsha
 
RA-11058_IRR-COMPRESS Do 198 series of 1998
RA-11058_IRR-COMPRESS Do 198 series of 1998RA-11058_IRR-COMPRESS Do 198 series of 1998
RA-11058_IRR-COMPRESS Do 198 series of 1998
 
Edukaciniai dropshipping via API with DroFx
Edukaciniai dropshipping via API with DroFxEdukaciniai dropshipping via API with DroFx
Edukaciniai dropshipping via API with DroFx
 
VIP High Class Call Girls Jamshedpur Anushka 8250192130 Independent Escort Se...
VIP High Class Call Girls Jamshedpur Anushka 8250192130 Independent Escort Se...VIP High Class Call Girls Jamshedpur Anushka 8250192130 Independent Escort Se...
VIP High Class Call Girls Jamshedpur Anushka 8250192130 Independent Escort Se...
 
BPAC WITH UFSBI GENERAL PRESENTATION 18_05_2017-1.pptx
BPAC WITH UFSBI GENERAL PRESENTATION 18_05_2017-1.pptxBPAC WITH UFSBI GENERAL PRESENTATION 18_05_2017-1.pptx
BPAC WITH UFSBI GENERAL PRESENTATION 18_05_2017-1.pptx
 
Delhi Call Girls Punjabi Bagh 9711199171 ☎✔👌✔ Whatsapp Hard And Sexy Vip Call
Delhi Call Girls Punjabi Bagh 9711199171 ☎✔👌✔ Whatsapp Hard And Sexy Vip CallDelhi Call Girls Punjabi Bagh 9711199171 ☎✔👌✔ Whatsapp Hard And Sexy Vip Call
Delhi Call Girls Punjabi Bagh 9711199171 ☎✔👌✔ Whatsapp Hard And Sexy Vip Call
 
Ravak dropshipping via API with DroFx.pptx
Ravak dropshipping via API with DroFx.pptxRavak dropshipping via API with DroFx.pptx
Ravak dropshipping via API with DroFx.pptx
 
Midocean dropshipping via API with DroFx
Midocean dropshipping via API with DroFxMidocean dropshipping via API with DroFx
Midocean dropshipping via API with DroFx
 
Call Girls in Sarai Kale Khan Delhi 💯 Call Us 🔝9205541914 🔝( Delhi) Escorts S...
Call Girls in Sarai Kale Khan Delhi 💯 Call Us 🔝9205541914 🔝( Delhi) Escorts S...Call Girls in Sarai Kale Khan Delhi 💯 Call Us 🔝9205541914 🔝( Delhi) Escorts S...
Call Girls in Sarai Kale Khan Delhi 💯 Call Us 🔝9205541914 🔝( Delhi) Escorts S...
 
BigBuy dropshipping via API with DroFx.pptx
BigBuy dropshipping via API with DroFx.pptxBigBuy dropshipping via API with DroFx.pptx
BigBuy dropshipping via API with DroFx.pptx
 
Delhi Call Girls CP 9711199171 ☎✔👌✔ Whatsapp Hard And Sexy Vip Call
Delhi Call Girls CP 9711199171 ☎✔👌✔ Whatsapp Hard And Sexy Vip CallDelhi Call Girls CP 9711199171 ☎✔👌✔ Whatsapp Hard And Sexy Vip Call
Delhi Call Girls CP 9711199171 ☎✔👌✔ Whatsapp Hard And Sexy Vip Call
 
Smarteg dropshipping via API with DroFx.pptx
Smarteg dropshipping via API with DroFx.pptxSmarteg dropshipping via API with DroFx.pptx
Smarteg dropshipping via API with DroFx.pptx
 
Mature dropshipping via API with DroFx.pptx
Mature dropshipping via API with DroFx.pptxMature dropshipping via API with DroFx.pptx
Mature dropshipping via API with DroFx.pptx
 
Data-Analysis for Chicago Crime Data 2023
Data-Analysis for Chicago Crime Data  2023Data-Analysis for Chicago Crime Data  2023
Data-Analysis for Chicago Crime Data 2023
 
Call me @ 9892124323 Cheap Rate Call Girls in Vashi with Real Photo 100% Secure
Call me @ 9892124323  Cheap Rate Call Girls in Vashi with Real Photo 100% SecureCall me @ 9892124323  Cheap Rate Call Girls in Vashi with Real Photo 100% Secure
Call me @ 9892124323 Cheap Rate Call Girls in Vashi with Real Photo 100% Secure
 
CebaBaby dropshipping via API with DroFX.pptx
CebaBaby dropshipping via API with DroFX.pptxCebaBaby dropshipping via API with DroFX.pptx
CebaBaby dropshipping via API with DroFX.pptx
 
Ukraine War presentation: KNOW THE BASICS
Ukraine War presentation: KNOW THE BASICSUkraine War presentation: KNOW THE BASICS
Ukraine War presentation: KNOW THE BASICS
 
04242024_CCC TUG_Joins and Relationships
04242024_CCC TUG_Joins and Relationships04242024_CCC TUG_Joins and Relationships
04242024_CCC TUG_Joins and Relationships
 

{tidytext}と{RMeCab}によるモダンな日本語テキスト分析

  • 1.
  • 2. > me $name [1] "Takashi Kitano" $twitter [1] "@kashitan" $work_in [1] " "
  • 4.
  • 6.
  • 7. > head(talks_en) # A tibble: 6 x 2 title_en transcript_en <chr> <chr> 1 Fake videos of real people — and how to … Look at these images. Now… 2 How to build synthetic DNA and send it a… "Alright, let me tell you… 3 Technology that knows what you're feeling "What happens when techno… 4 How to get empowered, not overpowered, b… "After 13.8 billion years… 5 Where joy hides and how to find it "It's 2008, and I'm just … 6 ""You Found Me"" (Cello music starts) You …
  • 8. > talks_en %>% tidytext::unnest_tokens(word, transcript_en)
  • 9. > talks_en %>% tidytext::unnest_tokens(word, transcript_en) # A tibble: 9,840 x 2 title_en word <chr> <chr> 1 Fake videos of real people — and how to spot them look 2 Fake videos of real people — and how to spot them at 3 Fake videos of real people — and how to spot them these 4 Fake videos of real people — and how to spot them images 5 Fake videos of real people — and how to spot them now 6 Fake videos of real people — and how to spot them tell 7 Fake videos of real people — and how to spot them me 8 Fake videos of real people — and how to spot them which 9 Fake videos of real people — and how to spot them obama 10 Fake videos of real people — and how to spot them here
  • 10.
  • 11. > talks_ja %>% head() # A tibble: 6 x 2 title_ja transcript_ja <chr> <chr> 1 … … 2 DNA … … 3 … 4 AI AI … 138 … 5 2008 … 6 You Found Me … …
  • 12. > talks_ja %>% tidytext::unnest_tokens(word, transcript_ja) # A tibble: 6,266,182 x 2 title_ja word <chr> <chr> 1 2 3 4 5 6 # ... with 6,266,172 more rows
  • 14. > talks_ja %>% tidytext::unnest_tokens(word, transcript_ja) # A tibble: 6,266,182 x 2 title_ja word <chr> <chr> 1 2 3 4 5 6 # ... with 6,266,172 more rows
  • 15.
  • 16.
  • 17.
  • 18. > mecab_result <- talks_ja %>% + RMeCabDF("transcript_ja", 1) > glimpse(mecab_result) List of 1 $ : Named chr [1:1445] " " " " " " " " ... ..- attr(*, "names")= chr [1:1445] " " " " " " " " ...
  • 19. > mecab_result <- talks_ja %>% + as.data.frame() %>% + RMeCabDF("transcript_ja", 1) > glimpse(mecab_result) List of 2551 $ : Named chr [1:1445] " " " " " " " " ... ..- attr(*, "names")= chr [1:1445] " " " " " " " " ... $ : Named chr [1:2903] " " " " " " " " ... ..- attr(*, "names")= chr [1:2903] " " " " " " " " ... $ : Named chr [1:2208] " " " " " " " " ... ..- attr(*, "names")= chr [1:2208] " " " " " " " " ...
  • 20. > # tibble 1 tibble > class(talks_ja[, "transcript_ja"]) [1] "tbl_df" "tbl" "data.frame" > # 1 > length(talks_ja[, "transcript_ja"]) [1] 1 > # data.frame 1 > class(as.data.frame(talks_ja)[, "transcript_ja"]) [1] "character" > # > length(as.data.frame(talks_ja)[, "transcript_ja"]) [1] 2551
  • 21. > tokens_ja <- purrr::pmap_df(list(nv = mecab_result, + title = talks_ja$title_ja), + function(nv, title){ + tibble(title = title, + word = nv, + hinshi = names(nv)) + })
  • 22. > tokens_ja # A tibble: 6,483,469 x 3 title word hinshi <chr> <chr> <chr> 1 2 3 4 5 6 # ... with 6,483,463 more rows
  • 23.
  • 24. > bigram_en <- talks_en %>% select(title_en, transcript_en) %>% + tidytext::unnest_tokens(bigram, transcript_en, token = "ngrams", n = 2) > head(bigram_en) # A tibble: 6 x 2 title_en bigram <chr> <chr> 1 ""(Nothing But) Flowers" with string quartet" music here 2 ""(Nothing But) Flowers" with string quartet" here we 3 ""(Nothing But) Flowers" with string quartet" we stand 4 ""(Nothing But) Flowers" with string quartet" stand like 5 ""(Nothing But) Flowers" with string quartet" like an 6 ""(Nothing But) Flowers" with string quartet" an adam
  • 25.
  • 26. > bigram_ja <- talks_ja %>% + as.data.frame() %>% + docDF(col = "transcript_ja", type=1, N = 2) number of extracted terms = 898167 now making a data frame. wait a while!
  • 27. > bigram_ja.bk %>% + select(TERM, POS1, Row1, Row2, Row3, Row4, Row5, Row6, Row7, Row8, Row9) %>% + head() TERM POS1 Row1 Row2 Row3 Row4 Row5 Row6 Row7 Row8 Row9 1 !-( - 0 0 0 0 0 0 0 0 0 2 !-7 - 0 0 0 0 0 0 0 0 0 3 !-Google - 0 0 0 0 0 0 0 0 0 4 !-Little - 0 0 0 0 0 0 0 0 0 5 !-Time - 0 0 0 0 0 0 0 0 0 6 !-Toonchi - 0 0 0 0 0 0 0 0 0
  • 28.
  • 29. > bigram_ja <- tokens_ja %>% + group_by(title) %>% + rename(word1 = word, + hinshi1 = hinshi) %>% + mutate(word2 = lead(word1), + hinshi2 = lead(hinshi1)) %>% + ungroup() %>% + filter(!is.na(word2)) %>% + select(title, word1, word2, hinshi1, hinshi2)
  • 30. > bigram_ja # A tibble: 6,480,920 x 5 title word1 word2 hinshi1 hinshi2 <chr> <chr> <chr> <chr> <chr> 1 2 3 4 5 6 7 8 # ... with 6,480,912 more rows
  • 31.