SlideShare une entreprise Scribd logo
1  sur  4
Télécharger pour lire hors ligne
import pandas as pd
d = pd.read_csv("YouTube-Spam-Collection-v1/Youtube01-Psy.csv")
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()
dvec = vectorizer.fit_transform(d['CONTENT'])
dshuf = d.sample(frac=1)
d_train = dshuf[:300]
d_test = dshuf[300:]
d_train_att = vectorizer.fit_transform(d_train['CONTENT']) # fit bag-of-words on training set
d_test_att = vectorizer.transform(d_test['CONTENT']) # reuse on testing set
d_train_label = d_train['CLASS']
d_test_label = d_test['CLASS']
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators=80)
clf.fit(d_train_att, d_train_label)
print(clf.score(d_test_att, d_test_label))
from sklearn.metrics import confusion_matrix
pred_labels = clf.predict(d_test_att)
print(confusion_matrix(d_test_label, pred_labels))
from sklearn.model_selection import cross_val_score
scores = cross_val_score(clf, d_train_att, d_train_label, cv=5)
# show average score and +/- two standard deviations away (covering 95% of scores)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
# load all datasets and combine them
d = pd.concat([pd.read_csv("YouTube-Spam-Collection-v1/Youtube01-Psy.csv"),
pd.read_csv("YouTube-Spam-Collection-v1/Youtube02-KatyPerry.csv"),
pd.read_csv("YouTube-Spam-Collection-v1/Youtube03-LMFAO.csv"),
pd.read_csv("YouTube-Spam-Collection-v1/Youtube04-Eminem.csv"),
pd.read_csv("YouTube-Spam-Collection-v1/Youtube05-Shakira.csv")])
dshuf = d.sample(frac=1)
d_content = dshuf['CONTENT']
d_label = dshuf['CLASS']
# set up a pipeline
from sklearn.pipeline import Pipeline, make_pipeline
pipeline = Pipeline([
('bag-of-words', CountVectorizer()),
('random forest', RandomForestClassifier()),
])
pipeline
pipeline.fit(d_content[:1500],d_label[:1500])
print(pipeline.score(d_content[1500:], d_label[1500:]))
print(pipeline.predict(["what a neat video!"]))
print(pipeline.predict(["plz subscribe to my channel"]))
scores = cross_val_score(pipeline, d_content, d_label, cv=5)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
# add tfidf
from sklearn.feature_extraction.text import TfidfTransformer
pipeline2 = make_pipeline(CountVectorizer(),
TfidfTransformer(norm=None),
RandomForestClassifier())
scores = cross_val_score(pipeline2, d_content, d_label, cv=5)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
# parameter search
parameters = {
'countvectorizer__max_features': (None, 1000, 2000),
'countvectorizer__ngram_range': ((1, 1), (1, 2)), # unigrams or bigrams
'countvectorizer__stop_words': ('english', None),
'tfidftransformer__use_idf': (True, False), # effectively turn on/off tfidf
'randomforestclassifier__n_estimators': (20, 50, 100)
}
from sklearn.model_selection import GridSearchCV
grid_search = GridSearchCV(pipeline2, parameters, n_jobs=-1, verbose=1)
grid_search.fit(d_content, d_label)
print("Best score: %0.3f" % grid_search.best_score_)
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
print("t%s: %r" % (param_name, best_parameters[param_name]))
Detect spam comments youtube videos and app store reviews

Contenu connexe

Similaire à Detect spam comments youtube videos and app store reviews

I want you to add the output of the F1 score- Precision- ROC AUC- and.pdf
I want you to add the output of the F1 score- Precision- ROC AUC- and.pdfI want you to add the output of the F1 score- Precision- ROC AUC- and.pdf
I want you to add the output of the F1 score- Precision- ROC AUC- and.pdf
MattU5mLambertq
 
Python Unit Test
Python Unit TestPython Unit Test
Python Unit Test
David Xie
 
Creating and Maintaining WordPress Plugins
Creating and Maintaining WordPress PluginsCreating and Maintaining WordPress Plugins
Creating and Maintaining WordPress Plugins
Mark Jaquith
 
Hello- I hope you are doing well- I am doing my project- which is Rans (1).pdf
Hello- I hope you are doing well- I am doing my project- which is Rans (1).pdfHello- I hope you are doing well- I am doing my project- which is Rans (1).pdf
Hello- I hope you are doing well- I am doing my project- which is Rans (1).pdf
Ian0J2Bondo
 
Classification examp
Classification exampClassification examp
Classification examp
Ryan Hong
 
I want you to add the output of the F1 score- Precision- ROC AUC- and.pdf
I want you to add the output of the F1 score- Precision- ROC AUC- and.pdfI want you to add the output of the F1 score- Precision- ROC AUC- and.pdf
I want you to add the output of the F1 score- Precision- ROC AUC- and.pdf
GordonF2XPatersonh
 

Similaire à Detect spam comments youtube videos and app store reviews (20)

Assignment 5.2.pdf
Assignment 5.2.pdfAssignment 5.2.pdf
Assignment 5.2.pdf
 
Assignment 6.2a.pdf
Assignment 6.2a.pdfAssignment 6.2a.pdf
Assignment 6.2a.pdf
 
Testing My Patience
Testing My PatienceTesting My Patience
Testing My Patience
 
I want you to add the output of the F1 score- Precision- ROC AUC- and.pdf
I want you to add the output of the F1 score- Precision- ROC AUC- and.pdfI want you to add the output of the F1 score- Precision- ROC AUC- and.pdf
I want you to add the output of the F1 score- Precision- ROC AUC- and.pdf
 
Baby Steps to Machine Learning at DevFest Lagos 2019
Baby Steps to Machine Learning at DevFest Lagos 2019Baby Steps to Machine Learning at DevFest Lagos 2019
Baby Steps to Machine Learning at DevFest Lagos 2019
 
Python Unit Test
Python Unit TestPython Unit Test
Python Unit Test
 
Django (Web Konferencia 2009)
Django (Web Konferencia 2009)Django (Web Konferencia 2009)
Django (Web Konferencia 2009)
 
Grid search (parameter tuning)
Grid search (parameter tuning)Grid search (parameter tuning)
Grid search (parameter tuning)
 
K fold
K foldK fold
K fold
 
Naive application of Machine Learning to Software Development
Naive application of Machine Learning to Software DevelopmentNaive application of Machine Learning to Software Development
Naive application of Machine Learning to Software Development
 
Unit test
Unit testUnit test
Unit test
 
Creating and Maintaining WordPress Plugins
Creating and Maintaining WordPress PluginsCreating and Maintaining WordPress Plugins
Creating and Maintaining WordPress Plugins
 
Python Ireland Nov 2010 Talk: Unit Testing
Python Ireland Nov 2010 Talk: Unit TestingPython Ireland Nov 2010 Talk: Unit Testing
Python Ireland Nov 2010 Talk: Unit Testing
 
Hello- I hope you are doing well- I am doing my project- which is Rans (1).pdf
Hello- I hope you are doing well- I am doing my project- which is Rans (1).pdfHello- I hope you are doing well- I am doing my project- which is Rans (1).pdf
Hello- I hope you are doing well- I am doing my project- which is Rans (1).pdf
 
Classification examp
Classification exampClassification examp
Classification examp
 
"Subclassing and Composition – A Pythonic Tour of Trade-Offs", Hynek Schlawack
"Subclassing and Composition – A Pythonic Tour of Trade-Offs", Hynek Schlawack"Subclassing and Composition – A Pythonic Tour of Trade-Offs", Hynek Schlawack
"Subclassing and Composition – A Pythonic Tour of Trade-Offs", Hynek Schlawack
 
Ml all programs
Ml all programsMl all programs
Ml all programs
 
I want you to add the output of the F1 score- Precision- ROC AUC- and.pdf
I want you to add the output of the F1 score- Precision- ROC AUC- and.pdfI want you to add the output of the F1 score- Precision- ROC AUC- and.pdf
I want you to add the output of the F1 score- Precision- ROC AUC- and.pdf
 
Python programming : Classes objects
Python programming : Classes objectsPython programming : Classes objects
Python programming : Classes objects
 
Machine Learning Algorithms
Machine Learning AlgorithmsMachine Learning Algorithms
Machine Learning Algorithms
 

Plus de Mamoon Ismail Khalid

T(X) Innoway - Prediction Algorithm design.pdf
T(X) Innoway - Prediction Algorithm design.pdfT(X) Innoway - Prediction Algorithm design.pdf
T(X) Innoway - Prediction Algorithm design.pdf
Mamoon Ismail Khalid
 
Golf Swing Analysis and Posture Correction System
Golf Swing Analysis and Posture Correction SystemGolf Swing Analysis and Posture Correction System
Golf Swing Analysis and Posture Correction System
Mamoon Ismail Khalid
 
24 ideas to revive any developing country.pdf
24 ideas to revive any developing country.pdf24 ideas to revive any developing country.pdf
24 ideas to revive any developing country.pdf
Mamoon Ismail Khalid
 

Plus de Mamoon Ismail Khalid (20)

Hospital Management and Inventory Control Solution for Public Hospitals in De...
Hospital Management and Inventory Control Solution for Public Hospitals in De...Hospital Management and Inventory Control Solution for Public Hospitals in De...
Hospital Management and Inventory Control Solution for Public Hospitals in De...
 
ATLAS - Product Requirement Document.pdf
ATLAS - Product Requirement Document.pdfATLAS - Product Requirement Document.pdf
ATLAS - Product Requirement Document.pdf
 
T(X) Innoway - Prediction Algorithm design.pdf
T(X) Innoway - Prediction Algorithm design.pdfT(X) Innoway - Prediction Algorithm design.pdf
T(X) Innoway - Prediction Algorithm design.pdf
 
Joint3DShapeMatching - a fast approach to 3D model matching using MatchALS 3...
Joint3DShapeMatching  - a fast approach to 3D model matching using MatchALS 3...Joint3DShapeMatching  - a fast approach to 3D model matching using MatchALS 3...
Joint3DShapeMatching - a fast approach to 3D model matching using MatchALS 3...
 
Golf Swing Analysis and Posture Correction System
Golf Swing Analysis and Posture Correction SystemGolf Swing Analysis and Posture Correction System
Golf Swing Analysis and Posture Correction System
 
24 ideas to revive any developing country.pdf
24 ideas to revive any developing country.pdf24 ideas to revive any developing country.pdf
24 ideas to revive any developing country.pdf
 
#2 - Smart Bins - Returnable Plastic Ecosystem.pdf
#2 - Smart Bins - Returnable Plastic Ecosystem.pdf#2 - Smart Bins - Returnable Plastic Ecosystem.pdf
#2 - Smart Bins - Returnable Plastic Ecosystem.pdf
 
PyTorch to detect Humans Eating Food.pdf
PyTorch to detect Humans Eating Food.pdfPyTorch to detect Humans Eating Food.pdf
PyTorch to detect Humans Eating Food.pdf
 
Future of agriculture agriculture - technology is a necessity in 2020 and beyond
Future of agriculture agriculture - technology is a necessity in 2020 and beyondFuture of agriculture agriculture - technology is a necessity in 2020 and beyond
Future of agriculture agriculture - technology is a necessity in 2020 and beyond
 
Nano mos25
Nano mos25Nano mos25
Nano mos25
 
Real estate in blockchain (2)
Real estate in blockchain (2)Real estate in blockchain (2)
Real estate in blockchain (2)
 
Cohort analysis saa s (1)
Cohort analysis saa s (1)Cohort analysis saa s (1)
Cohort analysis saa s (1)
 
ISA backed technology skills platform
ISA backed technology skills platformISA backed technology skills platform
ISA backed technology skills platform
 
Start up valuation methods
Start up valuation methodsStart up valuation methods
Start up valuation methods
 
Analysis mvp factory
Analysis mvp factoryAnalysis mvp factory
Analysis mvp factory
 
Start Up deal/interaction management workflow
Start Up deal/interaction management workflowStart Up deal/interaction management workflow
Start Up deal/interaction management workflow
 
Hunter.io scraper
Hunter.io scraperHunter.io scraper
Hunter.io scraper
 
Interact with information on largest 100 btc accounts ( python skeleton code)
Interact with information on largest 100 btc accounts ( python skeleton code)Interact with information on largest 100 btc accounts ( python skeleton code)
Interact with information on largest 100 btc accounts ( python skeleton code)
 
LinkedIn profile Public information Scraper
LinkedIn profile Public information ScraperLinkedIn profile Public information Scraper
LinkedIn profile Public information Scraper
 
Twitter sentiment analysis for cryptoassets
Twitter sentiment analysis for cryptoassets Twitter sentiment analysis for cryptoassets
Twitter sentiment analysis for cryptoassets
 

Dernier

Artificial Intelligence: Facts and Myths
Artificial Intelligence: Facts and MythsArtificial Intelligence: Facts and Myths
Artificial Intelligence: Facts and Myths
Joaquim Jorge
 
CNv6 Instructor Chapter 6 Quality of Service
CNv6 Instructor Chapter 6 Quality of ServiceCNv6 Instructor Chapter 6 Quality of Service
CNv6 Instructor Chapter 6 Quality of Service
giselly40
 
Histor y of HAM Radio presentation slide
Histor y of HAM Radio presentation slideHistor y of HAM Radio presentation slide
Histor y of HAM Radio presentation slide
vu2urc
 
IAC 2024 - IA Fast Track to Search Focused AI Solutions
IAC 2024 - IA Fast Track to Search Focused AI SolutionsIAC 2024 - IA Fast Track to Search Focused AI Solutions
IAC 2024 - IA Fast Track to Search Focused AI Solutions
Enterprise Knowledge
 

Dernier (20)

Powerful Google developer tools for immediate impact! (2023-24 C)
Powerful Google developer tools for immediate impact! (2023-24 C)Powerful Google developer tools for immediate impact! (2023-24 C)
Powerful Google developer tools for immediate impact! (2023-24 C)
 
Artificial Intelligence: Facts and Myths
Artificial Intelligence: Facts and MythsArtificial Intelligence: Facts and Myths
Artificial Intelligence: Facts and Myths
 
CNv6 Instructor Chapter 6 Quality of Service
CNv6 Instructor Chapter 6 Quality of ServiceCNv6 Instructor Chapter 6 Quality of Service
CNv6 Instructor Chapter 6 Quality of Service
 
[2024]Digital Global Overview Report 2024 Meltwater.pdf
[2024]Digital Global Overview Report 2024 Meltwater.pdf[2024]Digital Global Overview Report 2024 Meltwater.pdf
[2024]Digital Global Overview Report 2024 Meltwater.pdf
 
Evaluating the top large language models.pdf
Evaluating the top large language models.pdfEvaluating the top large language models.pdf
Evaluating the top large language models.pdf
 
Strategies for Landing an Oracle DBA Job as a Fresher
Strategies for Landing an Oracle DBA Job as a FresherStrategies for Landing an Oracle DBA Job as a Fresher
Strategies for Landing an Oracle DBA Job as a Fresher
 
Histor y of HAM Radio presentation slide
Histor y of HAM Radio presentation slideHistor y of HAM Radio presentation slide
Histor y of HAM Radio presentation slide
 
IAC 2024 - IA Fast Track to Search Focused AI Solutions
IAC 2024 - IA Fast Track to Search Focused AI SolutionsIAC 2024 - IA Fast Track to Search Focused AI Solutions
IAC 2024 - IA Fast Track to Search Focused AI Solutions
 
How to convert PDF to text with Nanonets
How to convert PDF to text with NanonetsHow to convert PDF to text with Nanonets
How to convert PDF to text with Nanonets
 
Apidays Singapore 2024 - Building Digital Trust in a Digital Economy by Veron...
Apidays Singapore 2024 - Building Digital Trust in a Digital Economy by Veron...Apidays Singapore 2024 - Building Digital Trust in a Digital Economy by Veron...
Apidays Singapore 2024 - Building Digital Trust in a Digital Economy by Veron...
 
The Role of Taxonomy and Ontology in Semantic Layers - Heather Hedden.pdf
The Role of Taxonomy and Ontology in Semantic Layers - Heather Hedden.pdfThe Role of Taxonomy and Ontology in Semantic Layers - Heather Hedden.pdf
The Role of Taxonomy and Ontology in Semantic Layers - Heather Hedden.pdf
 
Driving Behavioral Change for Information Management through Data-Driven Gree...
Driving Behavioral Change for Information Management through Data-Driven Gree...Driving Behavioral Change for Information Management through Data-Driven Gree...
Driving Behavioral Change for Information Management through Data-Driven Gree...
 
GenCyber Cyber Security Day Presentation
GenCyber Cyber Security Day PresentationGenCyber Cyber Security Day Presentation
GenCyber Cyber Security Day Presentation
 
Bajaj Allianz Life Insurance Company - Insurer Innovation Award 2024
Bajaj Allianz Life Insurance Company - Insurer Innovation Award 2024Bajaj Allianz Life Insurance Company - Insurer Innovation Award 2024
Bajaj Allianz Life Insurance Company - Insurer Innovation Award 2024
 
presentation ICT roal in 21st century education
presentation ICT roal in 21st century educationpresentation ICT roal in 21st century education
presentation ICT roal in 21st century education
 
Tech Trends Report 2024 Future Today Institute.pdf
Tech Trends Report 2024 Future Today Institute.pdfTech Trends Report 2024 Future Today Institute.pdf
Tech Trends Report 2024 Future Today Institute.pdf
 
Strategies for Unlocking Knowledge Management in Microsoft 365 in the Copilot...
Strategies for Unlocking Knowledge Management in Microsoft 365 in the Copilot...Strategies for Unlocking Knowledge Management in Microsoft 365 in the Copilot...
Strategies for Unlocking Knowledge Management in Microsoft 365 in the Copilot...
 
Automating Google Workspace (GWS) & more with Apps Script
Automating Google Workspace (GWS) & more with Apps ScriptAutomating Google Workspace (GWS) & more with Apps Script
Automating Google Workspace (GWS) & more with Apps Script
 
Data Cloud, More than a CDP by Matt Robison
Data Cloud, More than a CDP by Matt RobisonData Cloud, More than a CDP by Matt Robison
Data Cloud, More than a CDP by Matt Robison
 
GenAI Risks & Security Meetup 01052024.pdf
GenAI Risks & Security Meetup 01052024.pdfGenAI Risks & Security Meetup 01052024.pdf
GenAI Risks & Security Meetup 01052024.pdf
 

Detect spam comments youtube videos and app store reviews

  • 1. import pandas as pd d = pd.read_csv("YouTube-Spam-Collection-v1/Youtube01-Psy.csv") from sklearn.feature_extraction.text import CountVectorizer vectorizer = CountVectorizer() dvec = vectorizer.fit_transform(d['CONTENT']) dshuf = d.sample(frac=1) d_train = dshuf[:300] d_test = dshuf[300:] d_train_att = vectorizer.fit_transform(d_train['CONTENT']) # fit bag-of-words on training set d_test_att = vectorizer.transform(d_test['CONTENT']) # reuse on testing set d_train_label = d_train['CLASS'] d_test_label = d_test['CLASS'] from sklearn.ensemble import RandomForestClassifier clf = RandomForestClassifier(n_estimators=80) clf.fit(d_train_att, d_train_label) print(clf.score(d_test_att, d_test_label)) from sklearn.metrics import confusion_matrix pred_labels = clf.predict(d_test_att) print(confusion_matrix(d_test_label, pred_labels)) from sklearn.model_selection import cross_val_score scores = cross_val_score(clf, d_train_att, d_train_label, cv=5)
  • 2. # show average score and +/- two standard deviations away (covering 95% of scores) print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2)) # load all datasets and combine them d = pd.concat([pd.read_csv("YouTube-Spam-Collection-v1/Youtube01-Psy.csv"), pd.read_csv("YouTube-Spam-Collection-v1/Youtube02-KatyPerry.csv"), pd.read_csv("YouTube-Spam-Collection-v1/Youtube03-LMFAO.csv"), pd.read_csv("YouTube-Spam-Collection-v1/Youtube04-Eminem.csv"), pd.read_csv("YouTube-Spam-Collection-v1/Youtube05-Shakira.csv")]) dshuf = d.sample(frac=1) d_content = dshuf['CONTENT'] d_label = dshuf['CLASS'] # set up a pipeline from sklearn.pipeline import Pipeline, make_pipeline pipeline = Pipeline([ ('bag-of-words', CountVectorizer()), ('random forest', RandomForestClassifier()), ]) pipeline pipeline.fit(d_content[:1500],d_label[:1500]) print(pipeline.score(d_content[1500:], d_label[1500:])) print(pipeline.predict(["what a neat video!"])) print(pipeline.predict(["plz subscribe to my channel"]))
  • 3. scores = cross_val_score(pipeline, d_content, d_label, cv=5) print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2)) # add tfidf from sklearn.feature_extraction.text import TfidfTransformer pipeline2 = make_pipeline(CountVectorizer(), TfidfTransformer(norm=None), RandomForestClassifier()) scores = cross_val_score(pipeline2, d_content, d_label, cv=5) print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2)) # parameter search parameters = { 'countvectorizer__max_features': (None, 1000, 2000), 'countvectorizer__ngram_range': ((1, 1), (1, 2)), # unigrams or bigrams 'countvectorizer__stop_words': ('english', None), 'tfidftransformer__use_idf': (True, False), # effectively turn on/off tfidf 'randomforestclassifier__n_estimators': (20, 50, 100) } from sklearn.model_selection import GridSearchCV grid_search = GridSearchCV(pipeline2, parameters, n_jobs=-1, verbose=1) grid_search.fit(d_content, d_label) print("Best score: %0.3f" % grid_search.best_score_) print("Best parameters set:") best_parameters = grid_search.best_estimator_.get_params() for param_name in sorted(parameters.keys()): print("t%s: %r" % (param_name, best_parameters[param_name]))