SlideShare une entreprise Scribd logo
1  sur  23
Télécharger pour lire hors ligne
All YOUR PAGE ARE BELONG TO US
    すべてのウェブページをこの手に

            2012/11/16

           株式会社はてな
          大西康裕 id:onishi
  id:onishi 大西康裕
  ONISHI
  @yasuhiro_onishi
  株式会社はてな
  はてなブログ
ウェブページを
保存したい
ウェブページを保存したい
 •ウェブページは日々変化する
 •手元に置いておきたい
 •競合調査
 • 魚拓

 •画像などまとめて保存したい
Google
Chrome
HTML::Parser
# Skeleton of the approach: build an HTML::Parser, register one handler per
# event type, feed it the page, and print whatever was accumulated in $result.
# Each handler is given as [ code ref, argspec ]; the argspec string names the
# values passed to the code ref. The code refs here are empty placeholders.
my $result;
my $parser    = HTML::Parser->new(
    start_h      => [ sub {}, 'self,tagname,attr,text' ],
    default_h    => [ sub {}, 'self,text' ],
);
$parser->parse($content);
print $result;


   •   text
   •   start
   •   end
   •   process
   •   declaration
   •   comment
   •   default
HTML::Parser
# Start-tag handler: re-emits the tag into $result attribute by attribute,
# replacing every "src" attribute with the value returned by get_src().
start_h => [
    sub {
        my($self, $tagname, $attr, $text) = @_;
        $result .= "<$tagname";
        # Sorted keys give a stable attribute order in the output.
        for my $key (sort keys %$attr) {
             my $value = $attr->{$key};
             if ($key =~ /^(?:src)$/i) {
                 # HTTP GET the resource, save it, and switch to the local path
                 $value = get_src($value);
             }
             $result .= qq{ $key="$value"};
         }
         $result .= ">";
     },
     'self,tagname,attr,text',
],
HTML::Parser
# Fallback handler: every event without a dedicated handler has its original
# source text appended to $result unchanged.
default_h => [
    sub {
        my($self, $text) = @_;
        $result .= $text;
    },
    'self,text',
],
完
CSSから参照
# Rewrite every url(...) reference in the stylesheet text held in $content:
# download the referenced resource with get_src() and point the reference at
# the local copy. $url is the address of the stylesheet itself and serves as
# the base for relative references.
# The parentheses of "url(" and ")" must be escaped so they match literally;
# only the text between them is captured.
$content =~ s{url\(([^)]+)\)}{
    my $link = $1;

    # CSS allows url("x"), url('x') and url( x ): drop quotes and whitespace
    $link =~ s{^[\s"']+}{};
    $link =~ s{[\s"']+$}{};

    # relative link (from HTML::ResolveLink)
    my $u = URI->new($link);
    unless (defined $u->scheme) {
        $u = $u->abs($url);
    }
    $link = get_src($u); # HTTP GET, save, and switch to the local path
    "url($link)";
}eg;
script 殺す
# Dropping <script> elements. $context->{disallow} is incremented on a
# matching start tag and decremented on the matching end tag, so it is
# greater than zero exactly while the parser is between the two.
# NOTE(review): these handlers are excerpts; $tagname is not unpacked from @_
# here and the output-building code is omitted.
my $context = { disallow => 0 };
my $disallow_tag = qr{script};
start_h => [sub {
    if ($tagname =~ /^(?:$disallow_tag)$/i) {
        $context->{disallow}++; return;
    }
}],
end_h => [sub {
    if ($tagname =~ /^(?:$disallow_tag)$/i) {
        $context->{disallow}--; return;
    }
}],
# Returns early for every other event while the counter is positive.
default_h => [sub {
    if ($context->{disallow} > 0) {
        return;
    }
}],
noscript 内を生かす
# Handling of <noscript>: both the start-tag and end-tag handlers return
# early when the tag name matches. Unlike the script case, no counter is
# touched, so nothing here affects the events between the two tags.
# NOTE(review): excerpt only; $tagname is not unpacked from @_ here.
my $nodisplay_tag = qr{noscript};

start_h => [sub {
    if ($tagname =~ /^(?:$nodisplay_tag)$/i) {
        return;
    }
}],
end_h => [sub {
    if ($tagname =~ /^(?:$nodisplay_tag)$/i) {
        return;
    }
}],
base


# Handling of <base href="...">: the href value of a base tag is replaced
# with "./".
# NOTE(review): excerpt only; $tagname, $key and $value come from the
# attribute loop of the full start-tag handler, which is not repeated here.
start_h => [sub {
    if ($tagname =~ /^(?:base)$/i and $key =~ /^(?:href)$/i)   {
        $value = "./";
    }
}],
できました!

gist.github.com/

   4071196
#!/usr/bin/env perl
use strict;
use warnings;
use utf8;

use    DateTime;
use    Digest::SHA1 qw(sha1_hex);
use    Encode;
use    File::Path qw/make_path/;
use    HTML::Parser;
use    HTML::ResolveLink;
use    HTTP::Request::Common qw/GET/;
use    IO::All;
use    LWP::UserAgent;
use    URI;

# Root directory under which snapshots are written. It is also the value
# written into any <base href="..."> of the saved page.
my $path = './';

# The page to snapshot is the single command-line argument. URI->new accepts
# an undefined or empty string without failing, so check the argument itself.
my $target = shift;
die "usage: $0 URL\n" unless defined $target && length $target;

my $uri      = URI->new($target);
my $now      = DateTime->now;
my $ymd      = $now->ymd;
my $ua       = LWP::UserAgent->new(agent => 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)');
my $resolver = HTML::ResolveLink->new(base => $uri);
my $res      = $ua->request(GET $uri);

# Keep going on an HTTP error (the body is still saved), but say so.
warn "GET $uri: " . $res->status_line . "\n" unless $res->is_success;

# decoded_content can be undefined when decoding fails; fall back to raw body.
my $html = $res->decoded_content;
$html = $res->content unless defined $html;

# Turn relative links into absolute ones before parsing.
my $content = $resolver->resolve($html);

# Output directory: <path>/<sanitised URL>/<YYYY-MM-DD>/
my $dir = "$uri";
$dir =~ s{[^A-Za-z0-9.]+}{-}g;    # anything outside [A-Za-z0-9.] becomes "-"
$dir =~ s{-+$}{};
$dir = "$path/$dir/$ymd/";
$dir =~ s{/+}{/}g;                # collapse repeated slashes

make_path($dir);

# Elements removed together with their content.
my $disallow_tag  = qr{script};
# Elements whose tags are removed while their content is kept.
my $nodisplay_tag = qr{noscript};

# Rewritten document; starts empty so later operations never see undef.
my $result = '';
# disallow: greater than zero while inside a $disallow_tag element.
my $context = { disallow => 0 };
my $parser = HTML::Parser->new(
    api_version => 3,
    # Start tags are rebuilt attribute by attribute so that resource
    # references can be replaced with local paths.
    start_h => [
        sub {
            my($self, $tagname, $attr, $text) = @_;
            if ($tagname =~ /^(?:$nodisplay_tag)$/i) {
                return;
            }
            elsif ($tagname =~ /^(?:$disallow_tag)$/i) {
                $context->{disallow}++;
                return;
            }
            $result .= "<$tagname";
            for my $key (sort keys %$attr) {
                # A self-closing slash is reported as an attribute named "/".
                next if $key eq '/';
                my $value = $attr->{$key};
                if ($key =~ /^(?:src)$/i) {
                    $value = get_src($value);
                }
                elsif ($tagname =~ /^(?:link)$/i and $key =~ /^(?:href)$/i) {
                    $value = get_link($value);
                }
                elsif ($tagname =~ /^(?:base)$/i and $key =~ /^(?:href)$/i) {
                    $value = $path;
                }
                # The helpers return nothing for an empty reference.
                $value = '' unless defined $value;
                # HTML::Parser hands over attribute values with entities
                # already decoded, so re-escape what would break the markup.
                $value =~ s/&/&amp;/g;
                $value =~ s/"/&quot;/g;
                $result .= qq{ $key="$value"};
            }
            $result .= ">";
        },
        'self,tagname,attr,text',
    ],
    # End tags are copied through, except for the two special tag groups.
    end_h => [
        sub {
            my($self, $tagname, $text) = @_;
            if ($tagname =~ /^(?:$nodisplay_tag)$/i) {
                return;
            }
            elsif ($tagname =~ /^(?:$disallow_tag)$/i) {
                $context->{disallow}--;
                return;
            }
            $result .= $text;
        },
        'self,tagname,text',
    ],
    # Everything else is copied through unless inside a disallowed element.
    default_h => [
        sub {
            my($self, $text) = @_;
            if ($context->{disallow} > 0) {
                return;
            }
            $result .= $text;
        },
        'self,text',
    ],
);

$parser->parse($content);
# Signal end of input so text still buffered by the parser is delivered.
$parser->eof;

# The file is written as UTF-8 below, so declare that right after <head>.
$result =~ s{(<head[^>]*>)}{$1<meta http-equiv="Content-Type" content="text/html; charset=utf-8">}i; # XXX

$result = Encode::encode('utf-8', $result);

$result > io("${dir}index.html");

print "${dir}index.html\n";

# Download one resource into "${dir}file/" and return its path relative to
# $dir ("file/<name>").
#
#   $src - address of the resource (string or URI object); when it is empty
#          or missing, nothing is done and nothing is returned.
#
# The local name is the address with every run of characters outside
# [A-Za-z0-9.] replaced by "-"; names longer than 255 characters are replaced
# by their SHA-1 hex digest. A resource whose local file already exists is
# not fetched again. Uses the file-level $dir and $ua.
sub get_src {
    my $src = shift or return;

    # make_path does nothing when the directory is already there.
    make_path("${dir}file");

    my $file = $src;
    $file =~ s{[^A-Za-z0-9.]+}{-}g;
    if (length($file) > 255) {
        $file = sha1_hex($file);
    }
    $file = "file/$file";
    $file =~ s{/+}{/}g;

    unless (-e "$dir$file") {
        my $response = $ua->request(GET $src);
        # The body is stored either way, so that callers can rely on the
        # file existing, but a failed fetch is reported instead of hidden.
        warn "GET $src: " . $response->status_line . "\n"
            unless $response->is_success;
        $response->content >> io("$dir$file");
        # Pause between downloads so the remote site's anti-DoS measures
        # are not triggered.
        sleep(1);
    }
    return $file;
}

# Download the resource behind a <link href="..."> with get_src(), then
# localise every url(...) reference inside it: each referenced resource is
# downloaded as well and the reference is replaced by the local file name.
#
#   $url - address of the linked resource; also the base for resolving
#          relative url(...) references. When it is empty or missing,
#          nothing is done and nothing is returned.
#
# Returns the path of the saved resource relative to $dir ("file/<name>").
sub get_link {
    my $url = shift or return;
    my $file = get_src($url);
    my $io = io("$dir$file");
    my $content = $io->slurp;

    # The parentheses of "url(" and ")" are escaped so they match literally;
    # only the text between them is captured.
    $content =~ s{url\(([^)]+)\)}{
        my $link = $1;
        # CSS allows url("x"), url('x') and url( x ): drop quotes and spaces
        $link =~ s{^[\s"']+}{};
        $link =~ s{[\s"']+$}{};

        # relative link (from HTML::ResolveLink)
        my $u = URI->new($link);
        unless (defined $u->scheme) {
            $u = $u->abs($url);
        }
        $link = get_src($u);
        # This file itself lives in file/, so siblings need no prefix.
        $link =~ s{^file/}{};
        "url($link)";
    }eg;

    $content > $io;
    return $file;
}
Google
Chrome
wget.pl
どうぞご利用ください!

gist.github.com/

   4071196
ご清聴ありがとうございました

Contenu connexe

Tendances

introduction to Django in five slides
introduction to Django in five slides introduction to Django in five slides
introduction to Django in five slides Dan Chudnov
 
An Elephant of a Different Colour: Hack
An Elephant of a Different Colour: HackAn Elephant of a Different Colour: Hack
An Elephant of a Different Colour: HackVic Metcalfe
 
The History of PHPersistence
The History of PHPersistenceThe History of PHPersistence
The History of PHPersistenceHugo Hamon
 
Database Management - Lecture 4 - PHP and Mysql
Database Management - Lecture 4 - PHP and MysqlDatabase Management - Lecture 4 - PHP and Mysql
Database Management - Lecture 4 - PHP and MysqlAl-Mamun Sarkar
 
Database Design Patterns
Database Design PatternsDatabase Design Patterns
Database Design PatternsHugo Hamon
 
Uncovering Iterators
Uncovering IteratorsUncovering Iterators
Uncovering Iteratorssdevalk
 
PHP Lecture 4 - Working with form, GET and Post Methods
PHP Lecture 4 - Working with form, GET and Post MethodsPHP Lecture 4 - Working with form, GET and Post Methods
PHP Lecture 4 - Working with form, GET and Post MethodsAl-Mamun Sarkar
 
PHP for Adults: Clean Code and Object Calisthenics
PHP for Adults: Clean Code and Object CalisthenicsPHP for Adults: Clean Code and Object Calisthenics
PHP for Adults: Clean Code and Object CalisthenicsGuilherme Blanco
 
Doctrine fixtures
Doctrine fixturesDoctrine fixtures
Doctrine fixturesBill Chang
 
Perl Fitxers i Directoris
Perl Fitxers i DirectorisPerl Fitxers i Directoris
Perl Fitxers i Directorisfrankiejol
 
Object Calisthenics Applied to PHP
Object Calisthenics Applied to PHPObject Calisthenics Applied to PHP
Object Calisthenics Applied to PHPGuilherme Blanco
 

Tendances (20)

Database api
Database apiDatabase api
Database api
 
Inc
IncInc
Inc
 
introduction to Django in five slides
introduction to Django in five slides introduction to Django in five slides
introduction to Django in five slides
 
An Elephant of a Different Colour: Hack
An Elephant of a Different Colour: HackAn Elephant of a Different Colour: Hack
An Elephant of a Different Colour: Hack
 
The History of PHPersistence
The History of PHPersistenceThe History of PHPersistence
The History of PHPersistence
 
Database Management - Lecture 4 - PHP and Mysql
Database Management - Lecture 4 - PHP and MysqlDatabase Management - Lecture 4 - PHP and Mysql
Database Management - Lecture 4 - PHP and Mysql
 
PHP pod mikroskopom
PHP pod mikroskopomPHP pod mikroskopom
PHP pod mikroskopom
 
Database Design Patterns
Database Design PatternsDatabase Design Patterns
Database Design Patterns
 
Php
PhpPhp
Php
 
Php (1)
Php (1)Php (1)
Php (1)
 
Uncovering Iterators
Uncovering IteratorsUncovering Iterators
Uncovering Iterators
 
Agile database access with CakePHP 3
Agile database access with CakePHP 3Agile database access with CakePHP 3
Agile database access with CakePHP 3
 
linieaire regressie
linieaire regressielinieaire regressie
linieaire regressie
 
PHP Lecture 4 - Working with form, GET and Post Methods
PHP Lecture 4 - Working with form, GET and Post MethodsPHP Lecture 4 - Working with form, GET and Post Methods
PHP Lecture 4 - Working with form, GET and Post Methods
 
PHP for Adults: Clean Code and Object Calisthenics
PHP for Adults: Clean Code and Object CalisthenicsPHP for Adults: Clean Code and Object Calisthenics
PHP for Adults: Clean Code and Object Calisthenics
 
Doctrine fixtures
Doctrine fixturesDoctrine fixtures
Doctrine fixtures
 
Perl Fitxers i Directoris
Perl Fitxers i DirectorisPerl Fitxers i Directoris
Perl Fitxers i Directoris
 
CGI.pm - 3ло?!
CGI.pm - 3ло?!CGI.pm - 3ло?!
CGI.pm - 3ло?!
 
Table through php
Table through phpTable through php
Table through php
 
Object Calisthenics Applied to PHP
Object Calisthenics Applied to PHPObject Calisthenics Applied to PHP
Object Calisthenics Applied to PHP
 

En vedette

Redmine::ChanでIRCからプロジェクト管理
Redmine::ChanでIRCからプロジェクト管理Redmine::ChanでIRCからプロジェクト管理
Redmine::ChanでIRCからプロジェクト管理Yasuhiro Onishi
 
Webサーバ勉強会#5
Webサーバ勉強会#5Webサーバ勉強会#5
Webサーバ勉強会#5oranie Narut
 
ウェブアプリケーションのパフォーマンスチューニング
ウェブアプリケーションのパフォーマンスチューニングウェブアプリケーションのパフォーマンスチューニング
ウェブアプリケーションのパフォーマンスチューニングYasuhiro Onishi
 
Hatena blogdevelopmentflow
Hatena blogdevelopmentflowHatena blogdevelopmentflow
Hatena blogdevelopmentflowYasuhiro Onishi
 

En vedette (6)

開発合宿!!!!
開発合宿!!!!開発合宿!!!!
開発合宿!!!!
 
Redmine::ChanでIRCからプロジェクト管理
Redmine::ChanでIRCからプロジェクト管理Redmine::ChanでIRCからプロジェクト管理
Redmine::ChanでIRCからプロジェクト管理
 
oEmbed と Text::Hatena
oEmbed と Text::HatenaoEmbed と Text::Hatena
oEmbed と Text::Hatena
 
Webサーバ勉強会#5
Webサーバ勉強会#5Webサーバ勉強会#5
Webサーバ勉強会#5
 
ウェブアプリケーションのパフォーマンスチューニング
ウェブアプリケーションのパフォーマンスチューニングウェブアプリケーションのパフォーマンスチューニング
ウェブアプリケーションのパフォーマンスチューニング
 
Hatena blogdevelopmentflow
Hatena blogdevelopmentflowHatena blogdevelopmentflow
Hatena blogdevelopmentflow
 

Similaire à wget.pl

Perl Bag of Tricks - Baltimore Perl mongers
Perl Bag of Tricks  -  Baltimore Perl mongersPerl Bag of Tricks  -  Baltimore Perl mongers
Perl Bag of Tricks - Baltimore Perl mongersbrian d foy
 
Simple Ways To Be A Better Programmer (OSCON 2007)
Simple Ways To Be A Better Programmer (OSCON 2007)Simple Ways To Be A Better Programmer (OSCON 2007)
Simple Ways To Be A Better Programmer (OSCON 2007)Michael Schwern
 
Drupal Development (Part 2)
Drupal Development (Part 2)Drupal Development (Part 2)
Drupal Development (Part 2)Jeff Eaton
 
Crazy things done on PHP
Crazy things done on PHPCrazy things done on PHP
Crazy things done on PHPTaras Kalapun
 
(Ab)Using the MetaCPAN API for Fun and Profit
(Ab)Using the MetaCPAN API for Fun and Profit(Ab)Using the MetaCPAN API for Fun and Profit
(Ab)Using the MetaCPAN API for Fun and ProfitOlaf Alders
 
Developing applications for performance
Developing applications for performanceDeveloping applications for performance
Developing applications for performanceLeon Fayer
 
Adventures in Optimization
Adventures in OptimizationAdventures in Optimization
Adventures in OptimizationDavid Golden
 
The Art of Transduction
The Art of TransductionThe Art of Transduction
The Art of TransductionDavid Stockton
 
Writing Maintainable Perl
Writing Maintainable PerlWriting Maintainable Perl
Writing Maintainable Perltinypigdotcom
 
Advanced php testing in action
Advanced php testing in actionAdvanced php testing in action
Advanced php testing in actionJace Ju
 
Designing Opeation Oriented Web Applications / YAPC::Asia Tokyo 2011
Designing Opeation Oriented Web Applications / YAPC::Asia Tokyo 2011Designing Opeation Oriented Web Applications / YAPC::Asia Tokyo 2011
Designing Opeation Oriented Web Applications / YAPC::Asia Tokyo 2011Masahiro Nagano
 

Similaire à wget.pl (20)

Perl Bag of Tricks - Baltimore Perl mongers
Perl Bag of Tricks  -  Baltimore Perl mongersPerl Bag of Tricks  -  Baltimore Perl mongers
Perl Bag of Tricks - Baltimore Perl mongers
 
Bag of tricks
Bag of tricksBag of tricks
Bag of tricks
 
Daily notes
Daily notesDaily notes
Daily notes
 
Ae internals
Ae internalsAe internals
Ae internals
 
Simple Ways To Be A Better Programmer (OSCON 2007)
Simple Ways To Be A Better Programmer (OSCON 2007)Simple Ways To Be A Better Programmer (OSCON 2007)
Simple Ways To Be A Better Programmer (OSCON 2007)
 
PHP POWERPOINT SLIDES
PHP POWERPOINT SLIDESPHP POWERPOINT SLIDES
PHP POWERPOINT SLIDES
 
DBI
DBIDBI
DBI
 
Perl6 in-production
Perl6 in-productionPerl6 in-production
Perl6 in-production
 
Drupal Development (Part 2)
Drupal Development (Part 2)Drupal Development (Part 2)
Drupal Development (Part 2)
 
Crazy things done on PHP
Crazy things done on PHPCrazy things done on PHP
Crazy things done on PHP
 
My shell
My shellMy shell
My shell
 
Perl5i
Perl5iPerl5i
Perl5i
 
(Ab)Using the MetaCPAN API for Fun and Profit
(Ab)Using the MetaCPAN API for Fun and Profit(Ab)Using the MetaCPAN API for Fun and Profit
(Ab)Using the MetaCPAN API for Fun and Profit
 
Developing applications for performance
Developing applications for performanceDeveloping applications for performance
Developing applications for performance
 
Adventures in Optimization
Adventures in OptimizationAdventures in Optimization
Adventures in Optimization
 
The Art of Transduction
The Art of TransductionThe Art of Transduction
The Art of Transduction
 
Presentation1
Presentation1Presentation1
Presentation1
 
Writing Maintainable Perl
Writing Maintainable PerlWriting Maintainable Perl
Writing Maintainable Perl
 
Advanced php testing in action
Advanced php testing in actionAdvanced php testing in action
Advanced php testing in action
 
Designing Opeation Oriented Web Applications / YAPC::Asia Tokyo 2011
Designing Opeation Oriented Web Applications / YAPC::Asia Tokyo 2011Designing Opeation Oriented Web Applications / YAPC::Asia Tokyo 2011
Designing Opeation Oriented Web Applications / YAPC::Asia Tokyo 2011
 

Dernier

TrustArc Webinar - Stay Ahead of US State Data Privacy Law Developments
TrustArc Webinar - Stay Ahead of US State Data Privacy Law DevelopmentsTrustArc Webinar - Stay Ahead of US State Data Privacy Law Developments
TrustArc Webinar - Stay Ahead of US State Data Privacy Law DevelopmentsTrustArc
 
The Codex of Business Writing Software for Real-World Solutions 2.pptx
The Codex of Business Writing Software for Real-World Solutions 2.pptxThe Codex of Business Writing Software for Real-World Solutions 2.pptx
The Codex of Business Writing Software for Real-World Solutions 2.pptxMalak Abu Hammad
 
Axa Assurance Maroc - Insurer Innovation Award 2024
Axa Assurance Maroc - Insurer Innovation Award 2024Axa Assurance Maroc - Insurer Innovation Award 2024
Axa Assurance Maroc - Insurer Innovation Award 2024The Digital Insurer
 
08448380779 Call Girls In Friends Colony Women Seeking Men
08448380779 Call Girls In Friends Colony Women Seeking Men08448380779 Call Girls In Friends Colony Women Seeking Men
08448380779 Call Girls In Friends Colony Women Seeking MenDelhi Call girls
 
08448380779 Call Girls In Greater Kailash - I Women Seeking Men
08448380779 Call Girls In Greater Kailash - I Women Seeking Men08448380779 Call Girls In Greater Kailash - I Women Seeking Men
08448380779 Call Girls In Greater Kailash - I Women Seeking MenDelhi Call girls
 
Artificial Intelligence: Facts and Myths
Artificial Intelligence: Facts and MythsArtificial Intelligence: Facts and Myths
Artificial Intelligence: Facts and MythsJoaquim Jorge
 
Raspberry Pi 5: Challenges and Solutions in Bringing up an OpenGL/Vulkan Driv...
Raspberry Pi 5: Challenges and Solutions in Bringing up an OpenGL/Vulkan Driv...Raspberry Pi 5: Challenges and Solutions in Bringing up an OpenGL/Vulkan Driv...
Raspberry Pi 5: Challenges and Solutions in Bringing up an OpenGL/Vulkan Driv...Igalia
 
Factors to Consider When Choosing Accounts Payable Services Providers.pptx
Factors to Consider When Choosing Accounts Payable Services Providers.pptxFactors to Consider When Choosing Accounts Payable Services Providers.pptx
Factors to Consider When Choosing Accounts Payable Services Providers.pptxKatpro Technologies
 
Scaling API-first – The story of a global engineering organization
Scaling API-first – The story of a global engineering organizationScaling API-first – The story of a global engineering organization
Scaling API-first – The story of a global engineering organizationRadu Cotescu
 
A Call to Action for Generative AI in 2024
A Call to Action for Generative AI in 2024A Call to Action for Generative AI in 2024
A Call to Action for Generative AI in 2024Results
 
08448380779 Call Girls In Civil Lines Women Seeking Men
08448380779 Call Girls In Civil Lines Women Seeking Men08448380779 Call Girls In Civil Lines Women Seeking Men
08448380779 Call Girls In Civil Lines Women Seeking MenDelhi Call girls
 
Data Cloud, More than a CDP by Matt Robison
Data Cloud, More than a CDP by Matt RobisonData Cloud, More than a CDP by Matt Robison
Data Cloud, More than a CDP by Matt RobisonAnna Loughnan Colquhoun
 
Tata AIG General Insurance Company - Insurer Innovation Award 2024
Tata AIG General Insurance Company - Insurer Innovation Award 2024Tata AIG General Insurance Company - Insurer Innovation Award 2024
Tata AIG General Insurance Company - Insurer Innovation Award 2024The Digital Insurer
 
Handwritten Text Recognition for manuscripts and early printed texts
Handwritten Text Recognition for manuscripts and early printed textsHandwritten Text Recognition for manuscripts and early printed texts
Handwritten Text Recognition for manuscripts and early printed textsMaria Levchenko
 
Strategies for Unlocking Knowledge Management in Microsoft 365 in the Copilot...
Strategies for Unlocking Knowledge Management in Microsoft 365 in the Copilot...Strategies for Unlocking Knowledge Management in Microsoft 365 in the Copilot...
Strategies for Unlocking Knowledge Management in Microsoft 365 in the Copilot...Drew Madelung
 
Automating Google Workspace (GWS) & more with Apps Script
Automating Google Workspace (GWS) & more with Apps ScriptAutomating Google Workspace (GWS) & more with Apps Script
Automating Google Workspace (GWS) & more with Apps Scriptwesley chun
 
🐬 The future of MySQL is Postgres 🐘
🐬  The future of MySQL is Postgres   🐘🐬  The future of MySQL is Postgres   🐘
🐬 The future of MySQL is Postgres 🐘RTylerCroy
 
08448380779 Call Girls In Diplomatic Enclave Women Seeking Men
08448380779 Call Girls In Diplomatic Enclave Women Seeking Men08448380779 Call Girls In Diplomatic Enclave Women Seeking Men
08448380779 Call Girls In Diplomatic Enclave Women Seeking MenDelhi Call girls
 
04-2024-HHUG-Sales-and-Marketing-Alignment.pptx
04-2024-HHUG-Sales-and-Marketing-Alignment.pptx04-2024-HHUG-Sales-and-Marketing-Alignment.pptx
04-2024-HHUG-Sales-and-Marketing-Alignment.pptxHampshireHUG
 
Breaking the Kubernetes Kill Chain: Host Path Mount
Breaking the Kubernetes Kill Chain: Host Path MountBreaking the Kubernetes Kill Chain: Host Path Mount
Breaking the Kubernetes Kill Chain: Host Path MountPuma Security, LLC
 

Dernier (20)

TrustArc Webinar - Stay Ahead of US State Data Privacy Law Developments
TrustArc Webinar - Stay Ahead of US State Data Privacy Law DevelopmentsTrustArc Webinar - Stay Ahead of US State Data Privacy Law Developments
TrustArc Webinar - Stay Ahead of US State Data Privacy Law Developments
 
The Codex of Business Writing Software for Real-World Solutions 2.pptx
The Codex of Business Writing Software for Real-World Solutions 2.pptxThe Codex of Business Writing Software for Real-World Solutions 2.pptx
The Codex of Business Writing Software for Real-World Solutions 2.pptx
 
Axa Assurance Maroc - Insurer Innovation Award 2024
Axa Assurance Maroc - Insurer Innovation Award 2024Axa Assurance Maroc - Insurer Innovation Award 2024
Axa Assurance Maroc - Insurer Innovation Award 2024
 
08448380779 Call Girls In Friends Colony Women Seeking Men
08448380779 Call Girls In Friends Colony Women Seeking Men08448380779 Call Girls In Friends Colony Women Seeking Men
08448380779 Call Girls In Friends Colony Women Seeking Men
 
08448380779 Call Girls In Greater Kailash - I Women Seeking Men
08448380779 Call Girls In Greater Kailash - I Women Seeking Men08448380779 Call Girls In Greater Kailash - I Women Seeking Men
08448380779 Call Girls In Greater Kailash - I Women Seeking Men
 
Artificial Intelligence: Facts and Myths
Artificial Intelligence: Facts and MythsArtificial Intelligence: Facts and Myths
Artificial Intelligence: Facts and Myths
 
Raspberry Pi 5: Challenges and Solutions in Bringing up an OpenGL/Vulkan Driv...
Raspberry Pi 5: Challenges and Solutions in Bringing up an OpenGL/Vulkan Driv...Raspberry Pi 5: Challenges and Solutions in Bringing up an OpenGL/Vulkan Driv...
Raspberry Pi 5: Challenges and Solutions in Bringing up an OpenGL/Vulkan Driv...
 
Factors to Consider When Choosing Accounts Payable Services Providers.pptx
Factors to Consider When Choosing Accounts Payable Services Providers.pptxFactors to Consider When Choosing Accounts Payable Services Providers.pptx
Factors to Consider When Choosing Accounts Payable Services Providers.pptx
 
Scaling API-first – The story of a global engineering organization
Scaling API-first – The story of a global engineering organizationScaling API-first – The story of a global engineering organization
Scaling API-first – The story of a global engineering organization
 
A Call to Action for Generative AI in 2024
A Call to Action for Generative AI in 2024A Call to Action for Generative AI in 2024
A Call to Action for Generative AI in 2024
 
08448380779 Call Girls In Civil Lines Women Seeking Men
08448380779 Call Girls In Civil Lines Women Seeking Men08448380779 Call Girls In Civil Lines Women Seeking Men
08448380779 Call Girls In Civil Lines Women Seeking Men
 
Data Cloud, More than a CDP by Matt Robison
Data Cloud, More than a CDP by Matt RobisonData Cloud, More than a CDP by Matt Robison
Data Cloud, More than a CDP by Matt Robison
 
Tata AIG General Insurance Company - Insurer Innovation Award 2024
Tata AIG General Insurance Company - Insurer Innovation Award 2024Tata AIG General Insurance Company - Insurer Innovation Award 2024
Tata AIG General Insurance Company - Insurer Innovation Award 2024
 
Handwritten Text Recognition for manuscripts and early printed texts
Handwritten Text Recognition for manuscripts and early printed textsHandwritten Text Recognition for manuscripts and early printed texts
Handwritten Text Recognition for manuscripts and early printed texts
 
Strategies for Unlocking Knowledge Management in Microsoft 365 in the Copilot...
Strategies for Unlocking Knowledge Management in Microsoft 365 in the Copilot...Strategies for Unlocking Knowledge Management in Microsoft 365 in the Copilot...
Strategies for Unlocking Knowledge Management in Microsoft 365 in the Copilot...
 
Automating Google Workspace (GWS) & more with Apps Script
Automating Google Workspace (GWS) & more with Apps ScriptAutomating Google Workspace (GWS) & more with Apps Script
Automating Google Workspace (GWS) & more with Apps Script
 
🐬 The future of MySQL is Postgres 🐘
🐬  The future of MySQL is Postgres   🐘🐬  The future of MySQL is Postgres   🐘
🐬 The future of MySQL is Postgres 🐘
 
08448380779 Call Girls In Diplomatic Enclave Women Seeking Men
08448380779 Call Girls In Diplomatic Enclave Women Seeking Men08448380779 Call Girls In Diplomatic Enclave Women Seeking Men
08448380779 Call Girls In Diplomatic Enclave Women Seeking Men
 
04-2024-HHUG-Sales-and-Marketing-Alignment.pptx
04-2024-HHUG-Sales-and-Marketing-Alignment.pptx04-2024-HHUG-Sales-and-Marketing-Alignment.pptx
04-2024-HHUG-Sales-and-Marketing-Alignment.pptx
 
Breaking the Kubernetes Kill Chain: Host Path Mount
Breaking the Kubernetes Kill Chain: Host Path MountBreaking the Kubernetes Kill Chain: Host Path Mount
Breaking the Kubernetes Kill Chain: Host Path Mount
 

wget.pl

  • 1. All YOUR PAGE ARE BELONG TO US すべてのウェブページをこの手に 2012/11/16 株式会社はてな 大西康裕 id:onishi
  • 2.   id:onishi 大西康裕   ONISHI   @yasuhiro_onishi   株式会社はてな   はてなブログ
  • 6.
  • 7. HTML::Parser my $result; my $parser = HTML::Parser->new( start_h => [ sub {}, 'self,tagname,attr,text' ], default_h => [ sub {}, 'self,text' ], ); $parser->parse($content); print $result; • text • start • end • process • declaration • comment • default
  • 8. HTML::Parser start_h => [ sub { my($self, $tagname, $attr, $text) = @_; $result .= "<$tagname"; for my $key (sort keys %$attr) { my $value = $attr->{$key}; if ($key =~ /^(?:src)$/i) { # HTTP GET して保存してローカルパスにする $value = get_src($value); } $result .= qq{ $key="$value"}; } $result .= ">"; }, 'self,tagname,attr,text', ],
  • 9. HTML::Parser default_h => [ sub { my($self, $text) = @_; $result .= $text; }, 'self,text', ],
  • 10.
  • 11.
  • 12. CSSから参照 $content =~ s{url(([^)]+))}{ my $link = $1; # relative link (from HTML::ResolveLink) my $u = URI->new($link); unless (defined $u->scheme) { my $old = $u; $u = $u->abs($url); } $link = get_src($u); # HTTP GET して保存してローカルパスに "url($link)"; }eg;
  • 13. script 殺す my $context = { disallow => 0 }; my $disallow_tag = qr{script}; start_h => [sub { if ($tagname =~ /^(?:$disallow_tag)$/i) { $context->{disallow}++; return; } }], end_h => [sub { if ($tagname =~ /^(?:$disallow_tag)$/i) { $context->{disallow}--; return; } }], default_h => [sub { if ($context->{disallow} > 0) { return; } }],
  • 14. noscript 内を生かす my $nodisplay_tag = qr{noscript}; start_h => [sub { if ($tagname =~ /^(?:$nodisplay_tag)$/i) { return; } }], end_h => [sub { if ($tagname =~ /^(?:$nodisplay_tag)$/i) { return; } }],
  • 15. base start_h => [sub { if ($tagname =~ /^(?:base)$/i and $key =~ /^(?:href)$/i) { $value = "./"; } }],
  • 17. #!/usr/bin/env perl use strict; use warnings; use utf8; use DateTime; use Digest::SHA1 qw(sha1_hex); use Encode; use File::Path qw/make_path/; use HTML::Parser; use HTML::ResolveLink; use HTTP::Request::Common qw/GET/; use IO::All; use LWP::UserAgent; use URI; my $path = './'; my $uri = URI->new(shift) or die; my $now = DateTime->now; my $ymd = $now->ymd; my $ua = LWP::UserAgent->new(agent => 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)'); my $resolver = HTML::ResolveLink->new(base => $uri); my $res = $ua->request(GET $uri); my $content = $resolver->resolve($res->decoded_content); my $dir = $uri; $dir =~ s{[^A-Za-z0-9.]+}{-}g; $dir =~ s{-+$}{}; $dir = "$path/$dir/$ymd/"; $dir =~ s{/+}{/}g; make_path($dir); my $disallow_tag = qr{script}; my $nodisplay_tag = qr{noscript}; my $result; my $context = { disallow => 0 }; my $parser = HTML::Parser->new( api_version => 3, start_h => [ sub { my($self, $tagname, $attr, $text) = @_; if ($tagname =~ /^(?:$nodisplay_tag)$/i) { return; } elsif ($tagname =~ /^(?:$disallow_tag)$/i) { $context->{disallow}++; return; } $result .= "<$tagname"; for my $key (sort keys %$attr) { $key eq '/' and next; my $value = $attr->{$key}; if ($key =~ /^(?:src)$/i) { $value = get_src($value); } elsif ($tagname =~ /^(?:link)$/i and $key =~ /^(?:href)$/i) { $value = get_link($value); } elsif ($tagname =~ /^(?:base)$/i and $key =~ /^(?:href)$/i) { $value = $path; } $result .= qq{ $key="$value"}; } $result .= ">"; }, 'self,tagname,attr,text', ], end_h => [ sub { my($self, $tagname, $text) = @_; if ($tagname =~ /^(?:$nodisplay_tag)$/i) { return; } elsif ($tagname =~ /^(?:$disallow_tag)$/i) { $context->{disallow}--; return; } $result .= $text; }, 'self,tagname,text', ], default_h => [ sub { my($self, $text) = @_; if ($context->{disallow} > 0) { return; } $result .= $text; }, 'self,text', ], ); $parser->parse($content); $result =~ s{(<head[^>]*>)}{$1<meta http-equiv="Content-Type" content="text/html; charset=utf-8">}i; # XXX 
$result = Encode::encode('utf-8', $result); $result > io("${dir}index.html"); print "${dir}index.htmln"; sub get_src { my $src = shift or return; unless (-e "${dir}file") { make_path("${dir}file"); } my $file = $src; $file =~ s{[^A-Za-z0-9.]+}{-}g; if (length($file) > 255) { $file = sha1_hex($file); } $file = "file/$file"; $file =~ s{/+}{/}g; unless (-e "$dir$file") { $ua->request(GET $src)->content >> io("$dir$file"); sleep(1); # DOS対策対策 } $file; } sub get_link { my $url = shift or return; my $file = get_src($url); my $io = io("$dir$file"); my $content = $io->slurp; $content =~ s{url(([^)]+))}{ my $link = $1; $link =~ s{^[s"']+}{}; $link =~ s{[s"']+$}{}; # relative link (from HTML::ResolveLink) my $u = URI->new($link); unless (defined $u->scheme) { my $old = $u; $u = $u->abs($url); } $link = get_src($u); $link =~ s{^file/}{}; "url($link)"; }eg; $content > $io; return $file; }
  • 19.
  • 21.