crawler.pl
=pod Author: TR31N0RD Name: Crawler.pl Verion: 0.1 E-Mail: tr31n0rd@hotmail.com Tw: @TR31N0RD Greetz: All Members Of M1NDS =cut use LWP::UserAgent; use URI; use File::Basename; @Links=(); @List=(); $Peticiones=0; if ($#ARGV != 0) { Logo(); print "\nUsage: perl $0 http://example.com/index.php"; } else { Logo(); $c=0; #Agregamos la primera Ruta a la lista push(@List,$ARGV[0]); #Limpiamos / Al Final $r=chop($List[$c]); if($r ne "/"){$List[$c].=$r;} #Convertimos el String a URI $Link = URI->new($List[$c]); #Sacamos el protocolo Usado $Protocol=$Link->scheme(); #Sacamos el Host Principal $Host = $Link->host(); #Sacamos el Path Principal $PrincipalPath=$Link->path(); if($PrincipalPath eq ""){$PrincipalPath="//";} $PrincipalPath=SacarPath($PrincipalPath); print "Pagina: ".$Host."\n\n"; do { print "#".$c." URL: ".$List[$c]."\n"; $Link = URI->new($List[$c]); $Path = $Link->path(); if($Path eq ""){$Path="//";} $Path=SacarPath($Path); #print "Path: ".$Path."\n\n"; my $Source=GetContent($List[$c]); SacarURI($Source,"href=",$Path); SacarURI($Source,"src=",$Path); #Ignoramos las Rutas Externas foreach(@Crawl) { #Limpiado URL $_=~s/[\#|\?](.*$)//g; if($_=~/[\/|\\]$/){chop($_);} if($_=~/$Host/) { my $val=0; my $r1=$_; foreach(@List) { if($r1 eq $_){$val=1;last;} } if($val == 1){next;} push(@List,$_); } } $c++; }while($c<=$#List); print "\n\n$#List Archivos en $Peticiones Peticiones\n"; open (Lista, ">Crawler.txt"); foreach(@List) { print Lista $_."\n"; } close(Lista); } sub SacarPath() { my $Directorio=1; my $Original=$_[0]; my $r=chop($_[0]); if($r ne "/") { while(true) { $r=chop($_[0]); $Ruto.=$r; if($r eq '.'){$Directorio=0;} if($r eq '/'){last;} } if($Directorio==0) { return dirname($Original); } else { return $Original."/"; } } return $_[0]; } sub SacarURI() { my $Ruta=$_[2]; my(@Clean)=$_[0]=~m/($_[1]\"*(.*?)")/gi; @Links = grep(s/href=\"|src=\"|\"//gi,@Clean); #Sacando Rutas $Rutas=$Ruta; $Rutas=~s/\/[^\/]*$/\//; chop($Rutas); while($Rutas gt $PrincipalPath) { push(@Links,$Link->scheme().":\/\/".$Host.$Rutas); $Rutas=~s/\/[^\/]*$/\//; chop($Rutas); } foreach(@Links) { if($_=~/^http:\/\//i || $_=~/^https:\/\//i) { push(@Crawl,$_); } else { if($_=~/^mailto/i){next;} if($_=~/^\//i){push(@Crawl,"http://".$Host.$_);} else{push(@Crawl,$Link->scheme().":\/\/".$Host.$Ruta.$_);} } } } sub GetContent() { $Peticiones++; $ua = LWP::UserAgent->new(agent => 'Mozilla/5.0 (Windows; U; Windows NT 5.1; nl; rv:1.8.1.12) Gecko/20080201 Firefox/2.0.0.12'); $response = $ua->get($_[0]); return $response->content; } sub Logo() { print" # # # ## # # ##### #### ## ## # # ## # # # # # ## # # # # # # # #### # # # # # # # # # # # # # ## # # # # # # ##### # # ##### #### "; }