As I wanted loads of URLs to run some SVG-validation experiments on, I set out to write a spider. Somehow HttpRequest didn't get installed properly, so I messed up my code a bit more (making it far less efficient along the way) by using web-sniffer instead. That didn't work right away, I lost energy, and now I am putting it here so that I (or better, someone else) can find it and improve it later.
<?php
// The crawl report is plain text, not HTML.
header("Content-Type:text/plain");

// Crawl queue. Each entry is array('uri' => string, 'level' => int), where
// 'level' is the link distance from the seed page. addLinks() appends to
// $links and advances $high through `global`; $low is the read cursor.
$link = array();
$link['uri'] = "http://svg.startpagina.nl";
$link['level'] = 0;
$low = 0;          // index of the next entry to visit
$high = $low;      // index of the last entry queued so far
$links = array();
$links[$low] = $link;
$deepest = 3;      // links found on pages at this level or deeper are not queued

// Visit entries in the order they were queued. Stop once the read cursor
// has passed the last queued entry; the previous `while (true)` kept
// calling search() with undefined entries forever after the queue ran dry.
while ($low <= $high) {
    search($links[$low++]);
}
/**
 * Returns the file extension of the path component of a URL.
 *
 * Only the path is inspected, so query strings and fragments are ignored
 * ("http://host/a.html?x=b.svg" yields "html").
 *
 * @param string $url URL to inspect.
 * @return string Extension without the leading dot (e.g. "svg", "html"),
 *                or an empty string when the URL has no path or the path
 *                has no extension.
 */
function extension($url)
{
    $path = parse_url($url, PHP_URL_PATH);

    // parse_url() yields null when the URL has no path and false when the
    // URL is seriously malformed; neither can carry an extension.
    if (!is_string($path) || $path === '') {
        return '';
    }

    // With PATHINFO_EXTENSION, pathinfo() returns '' instead of omitting
    // the 'extension' key, so there is no undefined-index notice.
    return pathinfo($path, PATHINFO_EXTENSION);
}
/**
 * Finds the next case-insensitive occurrence of "href=" in a document.
 *
 * @param int    $end     Byte offset at which to start searching.
 * @param string $content Text to search. Optional so that existing
 *                        one-argument calls keep working; they search an
 *                        empty string and therefore find nothing.
 * @return int|false Byte offset of the match, or false when there is none.
 */
function hPos($end, $content = '')
{
    // stripos() rejects an offset outside the haystack (a warning on older
    // PHP, a ValueError on PHP 8), so treat that case as "not found".
    if (!is_string($content) || $end < 0 || $end > strlen($content)) {
        return false;
    }
    return stripos($content, "href=", $end);
}

/**
 * Queues every quoted href value found in $content.
 *
 * Each value is appended to the global $links queue as
 * array('uri' => value, 'level' => $level + 1), and the global $high is
 * advanced to the index of the last entry written. Nothing is queued when
 * $level has already reached the global $deepest limit.
 *
 * Values are queued verbatim: relative URLs are not resolved against the
 * page they came from, and duplicates are not filtered. Unquoted and empty
 * href values are skipped.
 *
 * @param string $content Markup to scan for href attributes.
 * @param int    $level   Crawl depth of the page $content came from.
 * @return void
 */
function addLinks($content, $level)
{
    global $links, $deepest, $high;

    if (!is_string($content) || $level >= $deepest) {
        return;
    }

    $length = strlen($content);
    $hrefLength = strlen("href=");
    $end = 0;

    while (($start = hPos($end, $content)) !== false) {
        $valueStart = $start + $hrefLength;
        // Resume after this "href=" whatever happens below, so the scan
        // always moves forward.
        $end = $valueStart;

        if ($valueStart >= $length) {
            break; // "href=" is the very last thing in the document
        }

        // The quote must directly follow "href="; the nearest quote
        // elsewhere in the document belongs to something else.
        $quot = $content[$valueStart];
        if ($quot !== '"' && $quot !== "'") {
            continue; // unquoted value: no reliable end delimiter
        }

        $valueStart++;
        $close = strpos($content, $quot, $valueStart);
        if ($close === false) {
            break; // unterminated attribute: nothing usable can follow
        }

        $end = $close + 1;
        $uri = substr($content, $valueStart, $close - $valueStart);
        if ($uri === '') {
            continue; // search() ignores empty URIs, so do not queue them
        }

        $links[++$high] = array('uri' => $uri, 'level' => $level + 1);
    }
}
/**
 * Visits one queued link: reports SVG resources and queues the links of
 * HTML pages.
 *
 * The MIME type is obtained by asking web-sniffer.net to issue a HEAD
 * request for the URI and scraping the Content-Type row out of the
 * returned page. (The original plan was HttpRequest: send(),
 * getResponseCode(), getResponseHeader("Content-Type") and
 * getResponseBody(). That class was not usable on the author's install.)
 *
 * Output, all via echo/print_r:
 *  - a dump of the whole queue and several debug lines per visit;
 *  - the URI when its extension is "svg";
 *  - the URI when it is served as image/svg+xml without an "svg" extension;
 *  - "ERROR" when an "svg" URI is not served as image/svg+xml.
 *
 * Terminates the script when the link's level exceeds the global $deepest.
 *
 * @param array|null $link Queue entry array('uri' => string, 'level' => int).
 *                         Anything without a non-empty 'uri' is ignored.
 * @return void
 */
function search($link)
{
    global $links, $deepest;

    print_r($links); // debug: dump the whole queue on every visit

    // Entries read past the end of the queue arrive as null.
    if (!is_array($link) || !isset($link['uri']) || $link['uri'] == "") {
        return;
    }
    $uri = $link['uri'];
    $level = isset($link['level']) ? $link['level'] : 0;

    if ($level > $deepest) {
        exit();
    }

    // The seed page is always treated as HTML, whatever its URL looks like.
    if ($level == 0) {
        $ext = "html";
    } else {
        $ext = extension($uri);
    }
    if ($ext == "svg") {
        echo "\n" . $uri;
    }

    // Throttle: one lookup every ten seconds.
    sleep(10);
    $sniffURL = "http://web-sniffer.net/?url=" . rawurlencode($uri) . "&submit=Submit&http=1.1&gzip=yes&type=HEAD&ua=Mozilla%2F5.0+%28Windows%3B+U%3B+Windows+NT+5.1%3B+en-US%3B+rv%3A1.8.1%29+Gecko%2F20061010+Firefox%2F2.0+Web-Sniffer%2F1.0.24";
    echo("\nsniffURL=$sniffURL");

    $sniffing = file_get_contents($sniffURL);
    if ($sniffing === false) {
        // Without the report there is no MIME type to act on.
        echo("\nERROR: could not fetch $sniffURL");
        return;
    }
    echo("\n\n\nsniffing=$sniffing");

    // NOTE(review): assumes web-sniffer renders the header as
    // "Content-Type:</td><td>VALUE</td>" - confirm against the live markup.
    $before = "Content-Type:</td><td>";
    $marker = strrpos($sniffing, $before);
    if ($marker === false) {
        // Previously false was used as offset 0, yielding a garbage MIME.
        echo("\nERROR: no Content-Type found for $uri");
        return;
    }
    $start = $marker + strlen($before);
    $after = strpos($sniffing, "</td>", $start);
    if ($after === false) {
        echo("\nERROR: malformed Content-Type cell for $uri");
        return;
    }
    $mime = substr($sniffing, $start, $after - $start);
    echo ("\nstart=$start");
    echo ("\nafter=$after");
    echo ("\nmime=$mime");

    // Drop parameters such as "; charset=utf-8" and normalise case so the
    // comparisons below see the bare media type.
    $semicolon = strpos($mime, ';');
    if ($semicolon !== false) {
        $mime = substr($mime, 0, $semicolon);
    }
    $mime = strtolower(trim($mime));

    if (($mime == "image/svg+xml") && ($ext != "svg")) {
        echo("\n$uri");
    }
    if (($mime != "image/svg+xml") && ($ext == "svg")) {
        echo("ERROR");
    }
    if ($mime == "text/html") {
        $content = file_get_contents($uri);
        // A failed download returns false; there is nothing to scan then.
        if ($content !== false) {
            addLinks($content, $level);
        }
    }
}
?>