SlideShare une entreprise Scribd logo
1  sur  47
Télécharger pour lire hors ligne
Ruby
Robots

Daniel Cukier
@danicuki
                http://www.flickr.com/photos/flysi/183272970
Relatives


• spiders
• crawlers
• bots
Why robot?
http://www.flickr.com/photos/nhankamer/5016628611
require 'anemone'

# Crawl starting at `url` and print the URL of each page handed to the
# on_every_page callback.
# NOTE(review): `url` is not assigned anywhere in this snippet; it has to be
# defined before this runs.
Anemone.crawl(url) do |anemone|
  # The inner block is registered through on_every_page and receives a page object.
  anemone.on_every_page do |page|
      puts page.url
  end
end                 http://www.cantora.mus.br/
                           http://www.cantora.mus.br/fotos
                           http://www.cantora.mus.br/?locale=en
                           http://www.cantora.mus.br/?locale=pt-BR
                           http://www.cantora.mus.br/musicas
                           http://www.cantora.mus.br/videos
                           http://www.cantora.mus.br/agenda
                           http://www.cantora.mus.br/novidades
                           http://www.cantora.mus.br/musicas/baixar
                           http://www.cantora.mus.br/visitors/baixar
                           http://www.cantora.mus.br/social
                           http://www.cantora.mus.br/fotos?locale=pt-BR
                           http://www.cantora.mus.br/musicas?locale=en
                           http://www.cantora.mus.br/fotos?locale=en
XPath
<html>
...
<div class="bla">
  <a>legal</a>
</div>
...
</html>




html_doc = Nokogiri::HTML(html)
info = html_doc.xpath(
  "//div[@class='bla']/a")
info.text
=> legal
XPath
<table id="super">   >> html_doc = Nokogiri::HTML(html)
  <tr>               >> info = html_doc.xpath(
    <td>L1C1</td>      "//table[@id='super']/tr")
    <td>L1C2</td>    >> info.size
                     => 3
  </tr>
  <tr>
                     >> info
    <td>L2C1</td>    => legal
    <td>L2C2</td>
  </tr>              >> info[0].xpath("td").size
  <tr>               => 2
    <td>L3C1</td>
    <td>L3C2</td>    >> info[2].xpath("td")[1].text
  </tr>              => "L3C2"
</table>
rest-client
GET

http://www.flickr.com/photos/amortize/766738216
http://www.flickr.com/photos/abbeychristine/223898960
Good bot

                                                /robots.txt


                                                User-agent: *
                                                Disallow:




http://www.flickr.com/photos/temily/5645585162
Ruby
Robots

Daniel Cukier
@danicuki
                http://www.flickr.com/photos/flysi/183272970
http://www.flickr.com/photos/nephelim/5632618462
maxRowsList=16
>>   body = RestClient.get(url)
 >>   json = JSON.parse(body)
 >>   content = json["Content"]
 >>   content.size
 =>   16
      AHA!!!
 http://.../artistas?maxRowsList=1600&filter=Recentes
 >>   body = RestClient.get(url)
 >>   json = JSON.parse(body)
 >>   content = json["Content"]
 >>   content.size
 =>   1600

http://.../artistas?maxRowsList=1600000&filter=Recentes
 >> content.size
 => 9154

       Bingo!!!
>> b["Content"].map {|c| c["ProfileUrl"]}
["caravella", "tomleite", "jeffersontavares", "rodrigoaraujo",
"jorgemendes", "bossapunk", "daviddepiro", "freetools", "ironia",
"tiagorosa", "outprofile", "lucianokoscky",
"bandateatraldecarona", "tlounge", "almanaque", "razzyoficial",
"cretinosecanalhas", "cincorios", "ninoantunes", "caiocorsalette",
"alinedelima", "thelio", "grupodomdesamba", "ladoz",
"alexandrepontes", "poeiradgua", "betimalu", "leonardobessa",
"kamaross", "marcusdocavaco", "atividadeinformal", "angelkeys",
"locojohn", "forcamusic", "tiaguinhoabreu", "marcelonegrao",
"jstonemghiphop", "uniaoglobal", "bandaefex", "severarock",
"manitu", "sasso", "kakka", "xsopretty", "belepoke", "caixaazul",
"wknd", "bandastarven", "bleiamusic", "3porcentoaocubo",
"lucianoterra", "hipnoia", "influencianegra", "bandaursamaior",
"mariafreitas", "jessejames", "vagnerrockxe", "stageo3",
"lemoneight", "innocence", "dinda", "marcelocapela",
"paulocamoeseoslusiadas", "magnussrock", "bandatheburk",
"mercantes", "bandaturnerock", "flaviasaolli", "tonysagga",
"thiagoponde", "centeio", "grupodeubranco", "bocadeleao",
"eusoueliascardan", "notoriaoficial", "planomasterrock", "rofgod",
"dreemonphc", "chicobrant", "osz", "bandalightspeed",
"cavernadenarnia", "sergiobenevenuto", "viniciusdeoliveira", ...]
email?
phone?
>> html = RestClient.get("http://.../robomacaco")
>> html_doc = Nokogiri::HTML(html)
>> info = html_doc.xpath("//span[@class='name']")
>> info.text
=> "robo-macaco@hotmail.com
RIO DE JANEIRO - RJ - Brasil
21 9675-0199
cookies



# Build a Hash of cookie names to values from a raw "name=value; name=value"
# cookie string.
cookies = {}
c = "s_nr=12954999; s_v19=12978609471; ... __utmc=206845458"
# Split every pair on the first "=" only, so a value that itself contains "="
# (for example base64 padding) is kept whole instead of being truncated.
c.split(";").each do |pair|
  name, value = pair.strip.split("=", 2)
  cookies[name] = value
end

RestClient.get(url, :cookies => cookies)
Proxies
http://www.ip-adress.com/proxy_list
>> response = RestClient.get(url)
>> html_doc = Nokogiri::HTML(response)
>> table = html_doc.xpath("//table
[@class='proxylist']")
>> lines = table.children
>> lines.shift   # drop the header row
                     Text

        IP
>> lines[1].text
=> "208.52.144.55 document.write(":"+i+r+i+r)
anonymous proxy server-2 minutes ago United States"
<script type="text/javascript">
  z=5;i=8;x=4;l=1;o=9;q=6;n=3;u=2;k=7;r=0;
</script>
JAVASCRIPT
     =
   RUBY



     http://www.flickr.com/photos/drics/4266471776/
<script type="text/javascript">
       z=5;i=8;x=4;l=1;o=9;q=6;n=3;u=2;k=7;r=0;
     </script>


>>   script = html_doc.xpath("//script")[1]
>>   eval script.text   # caution: this runs text fetched from the remote page as Ruby code
>>   z
=>   5
>>   i
=>   8
>> lines[1].text
=> "208.52.144.55 document.write(":"+i+r+i+r) anonymous
proxy server-2 minutes ago United States"


>> server = lines[1].text.split[0]
=> "208.52.144.55"


>>   digits = lines[1].text.split(")")[0].split("+")
=>   ["208.52.144.55document.write(":"", "i", "r", "i", "r"]
>>   digits.shift
>>   digits
=>   ["i", "r", "i", "r"]
>>   port = digits.map {|c| eval(c)}.join("")
=>   "8080"
                Voilà

RestClient.proxy = "http://#{server}:#{port}"
mechanize
# Submit the visitor form on the site's download page, then fetch and save
# every linked resource whose href mentions "track".
agent = Mechanize.new
site = "http://www.cantora.mus.br"

# Load the download page and fill in the visitor fields of its form.
page = agent.get("#{site}/baixar")
form = page.form
form['visitor[name]'] = 'daniel'
form['visitor[email]'] = "danicuki@gmail.com"

# Submitting the form yields the page that lists the links.
page = agent.submit(form)
tracks = page.links.select { |l| l.href =~ /track/ }

tracks.each do |t|
  # Use the link's href rather than the Link object itself, and terminate the
  # string literal (the original line was missing its closing quote).
  # NOTE(review): assumes each href is a site-relative path — confirm.
  file = agent.get("#{site}#{t.href}")
  file.save
end
protection techniques




                     javascript
                  text as image
                        captcha
              don’t be ingenuous
captcha
prove you are not a robot




      YES you can!
3 steps

1. Download Image
2. filter image
3. run OCR software
scaling




http://www.flickr.com/photos/liquene/3330714590
clouds


$ knife ec2 server create
threads
   +
queues
Nessa vida de programador maluco
Me aparece cada situação
De repente um cliente, uma proposta bruta
Pra pegar de um site informação
Você tá louco, esse tipo de crime eu não faço
Se quiser tenho uns amigos lá do sul
Faz pra mim que eu te pago com essa jóia cool

Te dou um ruby
Pra você roubar
Com o seu robô

Quer fazer robô?
É só usar ruby
É só usar ruby
Pra fazer robô

                                http://www.flickr.com/photos/jobafunky/5572503988
Thank you




Daniel Cukier
@danicuki

Contenu connexe

Tendances

Mojolicious: what works and what doesn't
Mojolicious: what works and what doesn'tMojolicious: what works and what doesn't
Mojolicious: what works and what doesn'tCosimo Streppone
 
H3 경쟁력있는 웹앱 개발을 위한 모바일 js 프레임웍
H3 경쟁력있는 웹앱 개발을 위한 모바일 js 프레임웍H3 경쟁력있는 웹앱 개발을 위한 모바일 js 프레임웍
H3 경쟁력있는 웹앱 개발을 위한 모바일 js 프레임웍민태 김
 
PerlでWeb API入門
PerlでWeb API入門PerlでWeb API入門
PerlでWeb API入門Yusuke Wada
 
Java & Script ─ 清羽
Java & Script ─ 清羽Java & Script ─ 清羽
Java & Script ─ 清羽taobao.com
 
URL Resources
URL ResourcesURL Resources
URL Resourcestombecky
 
Token Based Authentication Systems
Token Based Authentication SystemsToken Based Authentication Systems
Token Based Authentication SystemsHüseyin BABAL
 
Massive device deployment - EclipseCon 2011
Massive device deployment - EclipseCon 2011Massive device deployment - EclipseCon 2011
Massive device deployment - EclipseCon 2011Angelo van der Sijpt
 
R57php 1231677414471772-2
R57php 1231677414471772-2R57php 1231677414471772-2
R57php 1231677414471772-2ady36
 
The Web beyond "usernames & passwords" (OSDC12)
The Web beyond "usernames & passwords" (OSDC12)The Web beyond "usernames & passwords" (OSDC12)
The Web beyond "usernames & passwords" (OSDC12)Francois Marier
 
Persona: in your browsers, killing your passwords
Persona: in your browsers, killing your passwordsPersona: in your browsers, killing your passwords
Persona: in your browsers, killing your passwordsFrancois Marier
 
Keep It Simple Security (Symfony cafe 28-01-2016)
Keep It Simple Security (Symfony cafe 28-01-2016)Keep It Simple Security (Symfony cafe 28-01-2016)
Keep It Simple Security (Symfony cafe 28-01-2016)Oleg Zinchenko
 
Web+GISという視点から見たGISの方向性
Web+GISという視点から見たGISの方向性Web+GISという視点から見たGISの方向性
Web+GISという視点から見たGISの方向性Hidenori Fujimura
 
Angular js活用事例:filydoc
Angular js活用事例:filydocAngular js活用事例:filydoc
Angular js活用事例:filydocKeiichi Kobayashi
 
6.Conocimiento cliente Cuenta Pagos en Linea. (Interlat Group
6.Conocimiento cliente Cuenta Pagos en Linea. (Interlat Group6.Conocimiento cliente Cuenta Pagos en Linea. (Interlat Group
6.Conocimiento cliente Cuenta Pagos en Linea. (Interlat GroupInterlat
 
Beyond Posts & Pages - Structured Content in WordPress
Beyond Posts & Pages - Structured Content in WordPressBeyond Posts & Pages - Structured Content in WordPress
Beyond Posts & Pages - Structured Content in WordPressJohn Eckman
 

Tendances (20)

Mojolicious: what works and what doesn't
Mojolicious: what works and what doesn'tMojolicious: what works and what doesn't
Mojolicious: what works and what doesn't
 
H3 경쟁력있는 웹앱 개발을 위한 모바일 js 프레임웍
H3 경쟁력있는 웹앱 개발을 위한 모바일 js 프레임웍H3 경쟁력있는 웹앱 개발을 위한 모바일 js 프레임웍
H3 경쟁력있는 웹앱 개발을 위한 모바일 js 프레임웍
 
PerlでWeb API入門
PerlでWeb API入門PerlでWeb API入門
PerlでWeb API入門
 
Device deployment
Device deploymentDevice deployment
Device deployment
 
Java & Script ─ 清羽
Java & Script ─ 清羽Java & Script ─ 清羽
Java & Script ─ 清羽
 
URL Resources
URL ResourcesURL Resources
URL Resources
 
Token Based Authentication Systems
Token Based Authentication SystemsToken Based Authentication Systems
Token Based Authentication Systems
 
Massive device deployment - EclipseCon 2011
Massive device deployment - EclipseCon 2011Massive device deployment - EclipseCon 2011
Massive device deployment - EclipseCon 2011
 
R57php 1231677414471772-2
R57php 1231677414471772-2R57php 1231677414471772-2
R57php 1231677414471772-2
 
Rails by example
Rails by exampleRails by example
Rails by example
 
Pecha Kucha
Pecha KuchaPecha Kucha
Pecha Kucha
 
The Web beyond "usernames & passwords" (OSDC12)
The Web beyond "usernames & passwords" (OSDC12)The Web beyond "usernames & passwords" (OSDC12)
The Web beyond "usernames & passwords" (OSDC12)
 
Persona: in your browsers, killing your passwords
Persona: in your browsers, killing your passwordsPersona: in your browsers, killing your passwords
Persona: in your browsers, killing your passwords
 
Keep It Simple Security (Symfony cafe 28-01-2016)
Keep It Simple Security (Symfony cafe 28-01-2016)Keep It Simple Security (Symfony cafe 28-01-2016)
Keep It Simple Security (Symfony cafe 28-01-2016)
 
Blog Hacks 2011
Blog Hacks 2011Blog Hacks 2011
Blog Hacks 2011
 
Web+GISという視点から見たGISの方向性
Web+GISという視点から見たGISの方向性Web+GISという視点から見たGISの方向性
Web+GISという視点から見たGISの方向性
 
Angular js活用事例:filydoc
Angular js活用事例:filydocAngular js活用事例:filydoc
Angular js活用事例:filydoc
 
6.Conocimiento cliente Cuenta Pagos en Linea. (Interlat Group
6.Conocimiento cliente Cuenta Pagos en Linea. (Interlat Group6.Conocimiento cliente Cuenta Pagos en Linea. (Interlat Group
6.Conocimiento cliente Cuenta Pagos en Linea. (Interlat Group
 
API Design
API DesignAPI Design
API Design
 
Beyond Posts & Pages - Structured Content in WordPress
Beyond Posts & Pages - Structured Content in WordPressBeyond Posts & Pages - Structured Content in WordPress
Beyond Posts & Pages - Structured Content in WordPress
 

Similaire à Ruby Robots: Guide to Web Scraping and Automation

Velocity EU 2012 - Third party scripts and you
Velocity EU 2012 - Third party scripts and youVelocity EU 2012 - Third party scripts and you
Velocity EU 2012 - Third party scripts and youPatrick Meenan
 
Smashing the stats for fun (and profit)
Smashing the stats for fun (and profit)Smashing the stats for fun (and profit)
Smashing the stats for fun (and profit)Security B-Sides
 
HTTP For the Good or the Bad - FSEC Edition
HTTP For the Good or the Bad - FSEC EditionHTTP For the Good or the Bad - FSEC Edition
HTTP For the Good or the Bad - FSEC EditionXavier Mertens
 
Inspec one tool to rule them all
Inspec one tool to rule them allInspec one tool to rule them all
Inspec one tool to rule them allKimball Johnson
 
Log files: The Overlooked Source of SEO Opportunities
Log files: The Overlooked Source of SEO OpportunitiesLog files: The Overlooked Source of SEO Opportunities
Log files: The Overlooked Source of SEO OpportunitiesRobin Rozhon
 
Using HTML5 For a Great Open Web - Valtech Tech Days
Using HTML5 For a Great Open Web - Valtech Tech DaysUsing HTML5 For a Great Open Web - Valtech Tech Days
Using HTML5 For a Great Open Web - Valtech Tech DaysRobert Nyman
 
Illuminated Hacks -- Where 2.0 101 Tutorial
Illuminated Hacks -- Where 2.0 101 TutorialIlluminated Hacks -- Where 2.0 101 Tutorial
Illuminated Hacks -- Where 2.0 101 Tutorialmikel_maron
 
#NewMeetup Performance
#NewMeetup Performance#NewMeetup Performance
#NewMeetup PerformanceJustin Cataldo
 
Microformats: what are they and why do I care?
Microformats: what are they and why do I care?Microformats: what are they and why do I care?
Microformats: what are they and why do I care?adactio
 
Py conkr 20150829_docker-python
Py conkr 20150829_docker-pythonPy conkr 20150829_docker-python
Py conkr 20150829_docker-pythonEric Ahn
 
Py conkr 20150829_docker-python
Py conkr 20150829_docker-pythonPy conkr 20150829_docker-python
Py conkr 20150829_docker-pythonEric Ahn
 
20111014 mu me_html5
20111014 mu me_html520111014 mu me_html5
20111014 mu me_html5Erik Duval
 
Keep it simple web development stack
Keep it simple web development stackKeep it simple web development stack
Keep it simple web development stackEric Ahn
 

Similaire à Ruby Robots: Guide to Web Scraping and Automation (20)

Velocity EU 2012 - Third party scripts and you
Velocity EU 2012 - Third party scripts and youVelocity EU 2012 - Third party scripts and you
Velocity EU 2012 - Third party scripts and you
 
Smashing the stats for fun (and profit)
Smashing the stats for fun (and profit)Smashing the stats for fun (and profit)
Smashing the stats for fun (and profit)
 
Seti 09
Seti 09Seti 09
Seti 09
 
HTTP For the Good or the Bad - FSEC Edition
HTTP For the Good or the Bad - FSEC EditionHTTP For the Good or the Bad - FSEC Edition
HTTP For the Good or the Bad - FSEC Edition
 
Inspec one tool to rule them all
Inspec one tool to rule them allInspec one tool to rule them all
Inspec one tool to rule them all
 
Log files: The Overlooked Source of SEO Opportunities
Log files: The Overlooked Source of SEO OpportunitiesLog files: The Overlooked Source of SEO Opportunities
Log files: The Overlooked Source of SEO Opportunities
 
Using HTML5 For a Great Open Web - Valtech Tech Days
Using HTML5 For a Great Open Web - Valtech Tech DaysUsing HTML5 For a Great Open Web - Valtech Tech Days
Using HTML5 For a Great Open Web - Valtech Tech Days
 
The Devil and HTML5
The Devil and HTML5The Devil and HTML5
The Devil and HTML5
 
Mume HTML5 Intro
Mume HTML5 IntroMume HTML5 Intro
Mume HTML5 Intro
 
Illuminated Hacks -- Where 2.0 101 Tutorial
Illuminated Hacks -- Where 2.0 101 TutorialIlluminated Hacks -- Where 2.0 101 Tutorial
Illuminated Hacks -- Where 2.0 101 Tutorial
 
ApacheCon 2005
ApacheCon 2005ApacheCon 2005
ApacheCon 2005
 
Jabber Bot
Jabber BotJabber Bot
Jabber Bot
 
Api
ApiApi
Api
 
#NewMeetup Performance
#NewMeetup Performance#NewMeetup Performance
#NewMeetup Performance
 
Microformats: what are they and why do I care?
Microformats: what are they and why do I care?Microformats: what are they and why do I care?
Microformats: what are they and why do I care?
 
Py conkr 20150829_docker-python
Py conkr 20150829_docker-pythonPy conkr 20150829_docker-python
Py conkr 20150829_docker-python
 
Py conkr 20150829_docker-python
Py conkr 20150829_docker-pythonPy conkr 20150829_docker-python
Py conkr 20150829_docker-python
 
CEI Email 3.14.03
CEI Email 3.14.03CEI Email 3.14.03
CEI Email 3.14.03
 
20111014 mu me_html5
20111014 mu me_html520111014 mu me_html5
20111014 mu me_html5
 
Keep it simple web development stack
Keep it simple web development stackKeep it simple web development stack
Keep it simple web development stack
 

Plus de Daniel Cukier

Solidity: Zero to Hero Corporate Training
Solidity: Zero to Hero Corporate TrainingSolidity: Zero to Hero Corporate Training
Solidity: Zero to Hero Corporate TrainingDaniel Cukier
 
Spring e Injeção de Dependência
Spring e Injeção de DependênciaSpring e Injeção de Dependência
Spring e Injeção de DependênciaDaniel Cukier
 
Eficiency and Low Cost: Pro Tips for you to save 50% of your money with Googl...
Eficiency and Low Cost: Pro Tips for you to save 50% of your money with Googl...Eficiency and Low Cost: Pro Tips for you to save 50% of your money with Googl...
Eficiency and Low Cost: Pro Tips for you to save 50% of your money with Googl...Daniel Cukier
 
Startup Communities: From Nascence to Maturity
Startup Communities: From Nascence to MaturityStartup Communities: From Nascence to Maturity
Startup Communities: From Nascence to MaturityDaniel Cukier
 
Technology Startups Ecosystem in China - Lessons to other ecosystems
Technology Startups  Ecosystem in China - Lessons to other ecosystemsTechnology Startups  Ecosystem in China - Lessons to other ecosystems
Technology Startups Ecosystem in China - Lessons to other ecosystemsDaniel Cukier
 
Software Startup Ecosystems Evolution - The New York City Case Study
Software Startup Ecosystems Evolution - The New York City Case StudySoftware Startup Ecosystems Evolution - The New York City Case Study
Software Startup Ecosystems Evolution - The New York City Case StudyDaniel Cukier
 
Maturity model for Startup Ecosystems
Maturity model for Startup EcosystemsMaturity model for Startup Ecosystems
Maturity model for Startup EcosystemsDaniel Cukier
 
Why Google Cloud is so special? Stories from a cloud user
Why Google Cloud is so special?  Stories from a cloud userWhy Google Cloud is so special?  Stories from a cloud user
Why Google Cloud is so special? Stories from a cloud userDaniel Cukier
 
Software Architectures for a Single Person Team
Software Architectures for a Single Person TeamSoftware Architectures for a Single Person Team
Software Architectures for a Single Person TeamDaniel Cukier
 
Introduction to Functional Programming with Scala
Introduction to Functional Programming with ScalaIntroduction to Functional Programming with Scala
Introduction to Functional Programming with ScalaDaniel Cukier
 
O dia a dia de uma Startup
O dia a dia de uma StartupO dia a dia de uma Startup
O dia a dia de uma StartupDaniel Cukier
 
Injeção de Dependência e Testes com Dublês
Injeção de Dependência e Testes com DublêsInjeção de Dependência e Testes com Dublês
Injeção de Dependência e Testes com DublêsDaniel Cukier
 
Selecting Empirical Methods for Software Engineering
Selecting Empirical Methods for Software EngineeringSelecting Empirical Methods for Software Engineering
Selecting Empirical Methods for Software EngineeringDaniel Cukier
 
Is Computer Science Science?
Is Computer Science Science?Is Computer Science Science?
Is Computer Science Science?Daniel Cukier
 
Better Science Through Art
Better Science Through ArtBetter Science Through Art
Better Science Through ArtDaniel Cukier
 
Designed as Designer
Designed as DesignerDesigned as Designer
Designed as DesignerDaniel Cukier
 
When Should You Consider Meta Architectures
When Should You Consider Meta ArchitecturesWhen Should You Consider Meta Architectures
When Should You Consider Meta ArchitecturesDaniel Cukier
 

Plus de Daniel Cukier (20)

Solidity: Zero to Hero Corporate Training
Solidity: Zero to Hero Corporate TrainingSolidity: Zero to Hero Corporate Training
Solidity: Zero to Hero Corporate Training
 
Spring e Injeção de Dependência
Spring e Injeção de DependênciaSpring e Injeção de Dependência
Spring e Injeção de Dependência
 
Pair programming
Pair programmingPair programming
Pair programming
 
Eficiency and Low Cost: Pro Tips for you to save 50% of your money with Googl...
Eficiency and Low Cost: Pro Tips for you to save 50% of your money with Googl...Eficiency and Low Cost: Pro Tips for you to save 50% of your money with Googl...
Eficiency and Low Cost: Pro Tips for you to save 50% of your money with Googl...
 
Startup Communities: From Nascence to Maturity
Startup Communities: From Nascence to MaturityStartup Communities: From Nascence to Maturity
Startup Communities: From Nascence to Maturity
 
Technology Startups Ecosystem in China - Lessons to other ecosystems
Technology Startups  Ecosystem in China - Lessons to other ecosystemsTechnology Startups  Ecosystem in China - Lessons to other ecosystems
Technology Startups Ecosystem in China - Lessons to other ecosystems
 
Software Startup Ecosystems Evolution - The New York City Case Study
Software Startup Ecosystems Evolution - The New York City Case StudySoftware Startup Ecosystems Evolution - The New York City Case Study
Software Startup Ecosystems Evolution - The New York City Case Study
 
Maturity model for Startup Ecosystems
Maturity model for Startup EcosystemsMaturity model for Startup Ecosystems
Maturity model for Startup Ecosystems
 
Why Google Cloud is so special? Stories from a cloud user
Why Google Cloud is so special?  Stories from a cloud userWhy Google Cloud is so special?  Stories from a cloud user
Why Google Cloud is so special? Stories from a cloud user
 
Software Architectures for a Single Person Team
Software Architectures for a Single Person TeamSoftware Architectures for a Single Person Team
Software Architectures for a Single Person Team
 
Startup Communities
Startup CommunitiesStartup Communities
Startup Communities
 
Introduction to Functional Programming with Scala
Introduction to Functional Programming with ScalaIntroduction to Functional Programming with Scala
Introduction to Functional Programming with Scala
 
Play vs Rails
Play vs RailsPlay vs Rails
Play vs Rails
 
O dia a dia de uma Startup
O dia a dia de uma StartupO dia a dia de uma Startup
O dia a dia de uma Startup
 
Injeção de Dependência e Testes com Dublês
Injeção de Dependência e Testes com DublêsInjeção de Dependência e Testes com Dublês
Injeção de Dependência e Testes com Dublês
 
Selecting Empirical Methods for Software Engineering
Selecting Empirical Methods for Software EngineeringSelecting Empirical Methods for Software Engineering
Selecting Empirical Methods for Software Engineering
 
Is Computer Science Science?
Is Computer Science Science?Is Computer Science Science?
Is Computer Science Science?
 
Better Science Through Art
Better Science Through ArtBetter Science Through Art
Better Science Through Art
 
Designed as Designer
Designed as DesignerDesigned as Designer
Designed as Designer
 
When Should You Consider Meta Architectures
When Should You Consider Meta ArchitecturesWhen Should You Consider Meta Architectures
When Should You Consider Meta Architectures
 

Dernier

Automating Business Process via MuleSoft Composer | Bangalore MuleSoft Meetup...
Automating Business Process via MuleSoft Composer | Bangalore MuleSoft Meetup...Automating Business Process via MuleSoft Composer | Bangalore MuleSoft Meetup...
Automating Business Process via MuleSoft Composer | Bangalore MuleSoft Meetup...shyamraj55
 
Handwritten Text Recognition for manuscripts and early printed texts
Handwritten Text Recognition for manuscripts and early printed textsHandwritten Text Recognition for manuscripts and early printed texts
Handwritten Text Recognition for manuscripts and early printed textsMaria Levchenko
 
Enhancing Worker Digital Experience: A Hands-on Workshop for Partners
Enhancing Worker Digital Experience: A Hands-on Workshop for PartnersEnhancing Worker Digital Experience: A Hands-on Workshop for Partners
Enhancing Worker Digital Experience: A Hands-on Workshop for PartnersThousandEyes
 
[2024]Digital Global Overview Report 2024 Meltwater.pdf
[2024]Digital Global Overview Report 2024 Meltwater.pdf[2024]Digital Global Overview Report 2024 Meltwater.pdf
[2024]Digital Global Overview Report 2024 Meltwater.pdfhans926745
 
CNv6 Instructor Chapter 6 Quality of Service
CNv6 Instructor Chapter 6 Quality of ServiceCNv6 Instructor Chapter 6 Quality of Service
CNv6 Instructor Chapter 6 Quality of Servicegiselly40
 
SQL Database Design For Developers at php[tek] 2024
SQL Database Design For Developers at php[tek] 2024SQL Database Design For Developers at php[tek] 2024
SQL Database Design For Developers at php[tek] 2024Scott Keck-Warren
 
Strategies for Unlocking Knowledge Management in Microsoft 365 in the Copilot...
Strategies for Unlocking Knowledge Management in Microsoft 365 in the Copilot...Strategies for Unlocking Knowledge Management in Microsoft 365 in the Copilot...
Strategies for Unlocking Knowledge Management in Microsoft 365 in the Copilot...Drew Madelung
 
Injustice - Developers Among Us (SciFiDevCon 2024)
Injustice - Developers Among Us (SciFiDevCon 2024)Injustice - Developers Among Us (SciFiDevCon 2024)
Injustice - Developers Among Us (SciFiDevCon 2024)Allon Mureinik
 
Data Cloud, More than a CDP by Matt Robison
Data Cloud, More than a CDP by Matt RobisonData Cloud, More than a CDP by Matt Robison
Data Cloud, More than a CDP by Matt RobisonAnna Loughnan Colquhoun
 
Transcript: #StandardsGoals for 2024: What’s new for BISAC - Tech Forum 2024
Transcript: #StandardsGoals for 2024: What’s new for BISAC - Tech Forum 2024Transcript: #StandardsGoals for 2024: What’s new for BISAC - Tech Forum 2024
Transcript: #StandardsGoals for 2024: What’s new for BISAC - Tech Forum 2024BookNet Canada
 
Transforming Data Streams with Kafka Connect: An Introduction to Single Messa...
Transforming Data Streams with Kafka Connect: An Introduction to Single Messa...Transforming Data Streams with Kafka Connect: An Introduction to Single Messa...
Transforming Data Streams with Kafka Connect: An Introduction to Single Messa...HostedbyConfluent
 
04-2024-HHUG-Sales-and-Marketing-Alignment.pptx
04-2024-HHUG-Sales-and-Marketing-Alignment.pptx04-2024-HHUG-Sales-and-Marketing-Alignment.pptx
04-2024-HHUG-Sales-and-Marketing-Alignment.pptxHampshireHUG
 
Understanding the Laravel MVC Architecture
Understanding the Laravel MVC ArchitectureUnderstanding the Laravel MVC Architecture
Understanding the Laravel MVC ArchitecturePixlogix Infotech
 
The 7 Things I Know About Cyber Security After 25 Years | April 2024
The 7 Things I Know About Cyber Security After 25 Years | April 2024The 7 Things I Know About Cyber Security After 25 Years | April 2024
The 7 Things I Know About Cyber Security After 25 Years | April 2024Rafal Los
 
Mastering MySQL Database Architecture: Deep Dive into MySQL Shell and MySQL R...
Mastering MySQL Database Architecture: Deep Dive into MySQL Shell and MySQL R...Mastering MySQL Database Architecture: Deep Dive into MySQL Shell and MySQL R...
Mastering MySQL Database Architecture: Deep Dive into MySQL Shell and MySQL R...Miguel Araújo
 
Raspberry Pi 5: Challenges and Solutions in Bringing up an OpenGL/Vulkan Driv...
Raspberry Pi 5: Challenges and Solutions in Bringing up an OpenGL/Vulkan Driv...Raspberry Pi 5: Challenges and Solutions in Bringing up an OpenGL/Vulkan Driv...
Raspberry Pi 5: Challenges and Solutions in Bringing up an OpenGL/Vulkan Driv...Igalia
 
Tech-Forward - Achieving Business Readiness For Copilot in Microsoft 365
Tech-Forward - Achieving Business Readiness For Copilot in Microsoft 365Tech-Forward - Achieving Business Readiness For Copilot in Microsoft 365
Tech-Forward - Achieving Business Readiness For Copilot in Microsoft 3652toLead Limited
 
Boost PC performance: How more available memory can improve productivity
Boost PC performance: How more available memory can improve productivityBoost PC performance: How more available memory can improve productivity
Boost PC performance: How more available memory can improve productivityPrincipled Technologies
 
FULL ENJOY 🔝 8264348440 🔝 Call Girls in Diplomatic Enclave | Delhi
FULL ENJOY 🔝 8264348440 🔝 Call Girls in Diplomatic Enclave | DelhiFULL ENJOY 🔝 8264348440 🔝 Call Girls in Diplomatic Enclave | Delhi
FULL ENJOY 🔝 8264348440 🔝 Call Girls in Diplomatic Enclave | Delhisoniya singh
 
Unblocking The Main Thread Solving ANRs and Frozen Frames
Unblocking The Main Thread Solving ANRs and Frozen FramesUnblocking The Main Thread Solving ANRs and Frozen Frames
Unblocking The Main Thread Solving ANRs and Frozen FramesSinan KOZAK
 

Dernier (20)

Automating Business Process via MuleSoft Composer | Bangalore MuleSoft Meetup...
Automating Business Process via MuleSoft Composer | Bangalore MuleSoft Meetup...Automating Business Process via MuleSoft Composer | Bangalore MuleSoft Meetup...
Automating Business Process via MuleSoft Composer | Bangalore MuleSoft Meetup...
 
Handwritten Text Recognition for manuscripts and early printed texts
Handwritten Text Recognition for manuscripts and early printed textsHandwritten Text Recognition for manuscripts and early printed texts
Handwritten Text Recognition for manuscripts and early printed texts
 
Enhancing Worker Digital Experience: A Hands-on Workshop for Partners
Enhancing Worker Digital Experience: A Hands-on Workshop for PartnersEnhancing Worker Digital Experience: A Hands-on Workshop for Partners
Enhancing Worker Digital Experience: A Hands-on Workshop for Partners
 
[2024]Digital Global Overview Report 2024 Meltwater.pdf
[2024]Digital Global Overview Report 2024 Meltwater.pdf[2024]Digital Global Overview Report 2024 Meltwater.pdf
[2024]Digital Global Overview Report 2024 Meltwater.pdf
 
CNv6 Instructor Chapter 6 Quality of Service
CNv6 Instructor Chapter 6 Quality of ServiceCNv6 Instructor Chapter 6 Quality of Service
CNv6 Instructor Chapter 6 Quality of Service
 
SQL Database Design For Developers at php[tek] 2024
SQL Database Design For Developers at php[tek] 2024SQL Database Design For Developers at php[tek] 2024
SQL Database Design For Developers at php[tek] 2024
 
Strategies for Unlocking Knowledge Management in Microsoft 365 in the Copilot...
Strategies for Unlocking Knowledge Management in Microsoft 365 in the Copilot...Strategies for Unlocking Knowledge Management in Microsoft 365 in the Copilot...
Strategies for Unlocking Knowledge Management in Microsoft 365 in the Copilot...
 
Injustice - Developers Among Us (SciFiDevCon 2024)
Injustice - Developers Among Us (SciFiDevCon 2024)Injustice - Developers Among Us (SciFiDevCon 2024)
Injustice - Developers Among Us (SciFiDevCon 2024)
 
Data Cloud, More than a CDP by Matt Robison
Data Cloud, More than a CDP by Matt RobisonData Cloud, More than a CDP by Matt Robison
Data Cloud, More than a CDP by Matt Robison
 
Transcript: #StandardsGoals for 2024: What’s new for BISAC - Tech Forum 2024
Transcript: #StandardsGoals for 2024: What’s new for BISAC - Tech Forum 2024Transcript: #StandardsGoals for 2024: What’s new for BISAC - Tech Forum 2024
Transcript: #StandardsGoals for 2024: What’s new for BISAC - Tech Forum 2024
 
Transforming Data Streams with Kafka Connect: An Introduction to Single Messa...
Transforming Data Streams with Kafka Connect: An Introduction to Single Messa...Transforming Data Streams with Kafka Connect: An Introduction to Single Messa...
Transforming Data Streams with Kafka Connect: An Introduction to Single Messa...
 
04-2024-HHUG-Sales-and-Marketing-Alignment.pptx
04-2024-HHUG-Sales-and-Marketing-Alignment.pptx04-2024-HHUG-Sales-and-Marketing-Alignment.pptx
04-2024-HHUG-Sales-and-Marketing-Alignment.pptx
 
Understanding the Laravel MVC Architecture
Understanding the Laravel MVC ArchitectureUnderstanding the Laravel MVC Architecture
Understanding the Laravel MVC Architecture
 
The 7 Things I Know About Cyber Security After 25 Years | April 2024
The 7 Things I Know About Cyber Security After 25 Years | April 2024The 7 Things I Know About Cyber Security After 25 Years | April 2024
The 7 Things I Know About Cyber Security After 25 Years | April 2024
 
Mastering MySQL Database Architecture: Deep Dive into MySQL Shell and MySQL R...
Mastering MySQL Database Architecture: Deep Dive into MySQL Shell and MySQL R...Mastering MySQL Database Architecture: Deep Dive into MySQL Shell and MySQL R...
Mastering MySQL Database Architecture: Deep Dive into MySQL Shell and MySQL R...
 
Raspberry Pi 5: Challenges and Solutions in Bringing up an OpenGL/Vulkan Driv...
Raspberry Pi 5: Challenges and Solutions in Bringing up an OpenGL/Vulkan Driv...Raspberry Pi 5: Challenges and Solutions in Bringing up an OpenGL/Vulkan Driv...
Raspberry Pi 5: Challenges and Solutions in Bringing up an OpenGL/Vulkan Driv...
 
Tech-Forward - Achieving Business Readiness For Copilot in Microsoft 365
Tech-Forward - Achieving Business Readiness For Copilot in Microsoft 365Tech-Forward - Achieving Business Readiness For Copilot in Microsoft 365
Tech-Forward - Achieving Business Readiness For Copilot in Microsoft 365
 
Boost PC performance: How more available memory can improve productivity
Boost PC performance: How more available memory can improve productivityBoost PC performance: How more available memory can improve productivity
Boost PC performance: How more available memory can improve productivity
 
FULL ENJOY 🔝 8264348440 🔝 Call Girls in Diplomatic Enclave | Delhi
FULL ENJOY 🔝 8264348440 🔝 Call Girls in Diplomatic Enclave | DelhiFULL ENJOY 🔝 8264348440 🔝 Call Girls in Diplomatic Enclave | Delhi
FULL ENJOY 🔝 8264348440 🔝 Call Girls in Diplomatic Enclave | Delhi
 
Unblocking The Main Thread Solving ANRs and Frozen Frames
Unblocking The Main Thread Solving ANRs and Frozen FramesUnblocking The Main Thread Solving ANRs and Frozen Frames
Unblocking The Main Thread Solving ANRs and Frozen Frames
 

Ruby Robots: Guide to Web Scraping and Automation

  • 1. Ruby Robots Daniel Cukier @danicuki http://www.flickr.com/photos/flysi/183272970
  • 2.
  • 6. require 'anemone' Anemone.crawl(url) do |anemone| anemone.on_every_page do |page| puts page.url end end http://www.cantora.mus.br/ http://www.cantora.mus.br/fotos http://www.cantora.mus.br/?locale=en http://www.cantora.mus.br/?locale=pt-BR http://www.cantora.mus.br/musicas http://www.cantora.mus.br/videos http://www.cantora.mus.br/agenda http://www.cantora.mus.br/novidades http://www.cantora.mus.br/musicas/baixar http://www.cantora.mus.br/visitors/baixar http://www.cantora.mus.br/social http://www.cantora.mus.br/fotos?locale=pt-BR http://www.cantora.mus.br/musicas?locale=en http://www.cantora.mus.br/fotos?locale=en
  • 7.
  • 8. XPath <html> ... <div class="bla"> <a>legal</a> </div> ... </html> html_doc = Nokogiri::HTML(html) info = html_doc.xpath( "//div[@class='bla']/a") info.text => legal
  • 9. XPath <table id="super"> >> html_doc = Nokogiri::HTML(html) <tr> >> info = html_doc.xpath( <td>L1C1</td> "//table[@id='super']/tr") <td>L1C2</td> >> info.size => 3 </tr> <tr> >> info <td>L2C1</td> => legal <td>L2C2</td> </tr> >> info[0].xpath("td").size <tr> => 2 <td>L3C1</td> <td>L3C2</td> >> info[2].xpath("td")[1].text </tr> => "L3C2" </table>
  • 13. Good bot /robots.txt User-agent: * Disallow: http://www.flickr.com/photos/temily/5645585162
  • 14. Ruby Robots Daniel Cukier @danicuki http://www.flickr.com/photos/flysi/183272970
  • 16.
  • 18.
  • 19.
  • 20. >> body = RestClient.get(url) >> json = JSON.parse(body) >> content = json["Content"] >> content.size => 16 AHA!!! http://.../artistas?maxRowsList=1600&filter=Recentes >> body = RestClient.get(url) >> json = JSON.parse(body) >> content = json["Content"] >> content.size => 1600 http://.../artistas?maxRowsList=1600000&filter=Recentes >> content.size => 9154 Bingo!!!
  • 21. >> b["Content"].map {|c| c["ProfileUrl"]} ["caravella", "tomleite", "jeffersontavares", "rodrigoaraujo", "jorgemendes", "bossapunk", "daviddepiro", "freetools", "ironia", "tiagorosa", "outprofile", "lucianokoscky", "bandateatraldecarona", "tlounge", "almanaque", "razzyoficial", "cretinosecanalhas", "cincorios", "ninoantunes", "caiocorsalette", "alinedelima", "thelio", "grupodomdesamba", "ladoz", "alexandrepontes", "poeiradgua", "betimalu", "leonardobessa", "kamaross", "marcusdocavaco", "atividadeinformal", "angelkeys", "locojohn", "forcamusic", "tiaguinhoabreu", "marcelonegrao", "jstonemghiphop", "uniaoglobal", "bandaefex", "severarock", "manitu", "sasso", "kakka", "xsopretty", "belepoke", "caixaazul", "wknd", "bandastarven", "bleiamusic", "3porcentoaocubo", "lucianoterra", "hipnoia", "influencianegra", "bandaursamaior", "mariafreitas", "jessejames", "vagnerrockxe", "stageo3", "lemoneight", "innocence", "dinda", "marcelocapela", "paulocamoeseoslusiadas", "magnussrock", "bandatheburk", "mercantes", "bandaturnerock", "flaviasaolli", "tonysagga", "thiagoponde", "centeio", "grupodeubranco", "bocadeleao", "eusoueliascardan", "notoriaoficial", "planomasterrock", "rofgod", "dreemonphc", "chicobrant", "osz", "bandalightspeed", "cavernadenarnia", "sergiobenevenuto", "viniciusdeoliveira", ...]
  • 23.
  • 24. >> html = RestClient.get("http://.../robomacaco") >> html_doc = Nokogiri::HTML(html) >> info = html_doc.xpath("//span[@class='name']") >> info.text => "robo-macaco@hotmail.com RIO DE JANEIRO - RJ - Brasil 21 9675-0199
  • 25.
  • 26. cookies cookies = {} c = "s_nr=12954999; s_v19=12978609471; ... __utmc=206845458" cook = c.split(";").map {|i| i.strip.split("=")} cook.each {|u| cookies[u[0]] = u[1]} RestClient.get(url, :cookies => cookies)
  • 29.
  • 30.
  • 31.
  • 32. >> response = RestClient.get(url) >> html_doc = Nokogiri::HTML(response) >> table = html_doc.xpath("//table [@class='proxylist']") >> lines = table.children >> lines.shift # tira o cabeçalho Text IP >> lines[1].text => "208.52.144.55 document.write(":"+i+r+i+r) anonymous proxy server-2 minutes ago United States"
  • 34. JAVASCRIPT = RUBY http://www.flickr.com/photos/drics/4266471776/
  • 35. <script type="text/javascript"> z=5;i=8;x=4;l=1;o=9;q=6;n=3;u=2;k=7;r=0; </script> >> script = html_doc.xpath("//script")[1] >> eval script.text >> z => 5 >> i => 8
  • 36. >> lines[1].text => "208.52.144.55 document.write(":"+i+r+i+r) anonymous proxy server-2 minutes ago United States" >> server = lines[1].text.split[0] => "208.52.144.55" >> digits = lines[1].text.split(")")[0].split("+") => ["208.52.144.55document.write(":"", "i", "r", "i", "r"] >> digits.shift >> digits => ["i", "r", "i", "r"] >> port = digits.map {|c| eval(c)}.join("") => "8080" Voilà RestClient.proxy = "http://#{server}:#{port}"
  • 37. mechanize agent = Mechanize.new site = "http://www.cantora.mus.br" page = agent.get("#{site}/baixar") form = page.form form['visitor[name]'] = 'daniel' form['visitor[email]'] = "danicuki@gmail.com" page = agent.submit(form) tracks = page.links.select { |l| l.href =~ /track/ } tracks.each do |t| file = agent.get("#{site}#{t}") file.save end
  • 38. protection techniques: javascript, text as image, captcha. Don’t be naive
  • 39. captcha prove you are not a robot YES you can!
  • 40. 3 steps 1. Download Image 2. filter image 3. run OCR software
  • 41.
  • 43. clouds $ knife ec2 server create
  • 44. threads + queues
  • 45.
  • 46. Nessa vida de programador maluco Me aparece cada situação De repente um cliente, uma proposta bruta Pra pegar de um site informação Você tá louco, esse tipo de crime eu não faço Se quiser tenho uns amigos lá do sul Faz pra mim que eu te pago com essa jóia cool Te dou um ruby Pra você roubar Com o seu robô Quer fazer robô? É só usar ruby É só usar ruby Pra fazer robô http://www.flickr.com/photos/jobafunky/5572503988