<?
 
 
/**
 
* About author:
 
*  vivekanandan
 
* email: [email protected]
 
*
 
* If you want to any help on spider ot any thing in php just mail me 
 
*
 
* About class:
 
*  WebSpider    -  constructor set teh domain & url to map it 
 
*  
 
*   processTagInPageData()     - it process the anchor tag & frame tag as googlebot does
 
*   fetchURLPageData()         - it returns the html page content for the given URL
 
*     isURLExists()            - it checks wheather the given url is added in DB  
 
*   displayDomainRecords    - it displays teh records  from DB    
 
*     StoreUniqueURL             - it stores the unique url in DB 
 
*     processSpecificTagbyType-  it parse each & every tag & truncate the parsed tag from the string 
 
*/
 
 
 
 
ini_set("display_errors",1); 
 
 
 
/* table structure 
 
 
CREATE TABLE `spider` (
 
  `id` bigint(20) NOT NULL auto_increment,
 
  `domain` varchar(150) NOT NULL,
 
  `url` varchar(2000) NOT NULL,
 
  `parentid` int(11) NOT NULL,
 
  `visitflag` int(11) NOT NULL,
 
  `type` varchar(20) NOT NULL,
 
  `level` mediumint(9) NOT NULL,
 
  PRIMARY KEY  (`id`)
 
)
 
*/
 
 
 
class WebSpider {
 
 
var $mMaxDepth;
 
var $mDomain;
 
var $mDBHost;
 
var $mDBUserName;
 
var $mDBPassword;
 
var $mDBDatabase;
 
var $mURLPageData;
 
var $mURL;
 
 
 
    function WebSpider($pmDomain, $pmDepth,$pmURL) { 
 
    
 
        $this->mDomain        = $pmDomain;
 
        $this->mMaxDepth    = $pmDepth;
 
        $this->mURL            = $pmURL;
 
                    
 
    }
 
    
 
    function isURLExists($pmDomain, $pmURL) {  
 
 
        mysql_connect($this->mDBHost,$this->mDBUserName,$this->mDBPassword);
 
        mysql_select_db($this->mDBDatabase);  
 
        
 
        $vSQL  = "SELECT count( id ) AS cnt FROM spider
 
                    WHERE  domain = '$pmDomain' and url = '$pmURL'";
 
        $rs = mysql_query($vSQL); 
 
        $oRecord = mysql_fetch_assoc($rs);
 
        return $oRecord['cnt'];
 
        
 
    }
 
 
    function displayDomainRecords($pmDomain){
 
    
 
           mysql_connect($this->mDBHost,$this->mDBUserName,$this->mDBPassword);
 
        mysql_select_db($this->mDBDatabase);          
 
        
 
        $vSQL = "select count(id) as cnt   from  spider where domain = '$pmDomain' ";
 
        $rsURLList = mysql_query($vSQL);
 
        $vCnt = mysql_fetch_assoc($rsURLList);
 
        
 
            
 
        $vSQL = "select *   from  spider where domain = '$pmDomain' order by id asc ";
 
        $rsURLList = mysql_query($vSQL);
 
        print "<strong>Domain</strong> : ".$pmDomain ." <strong>Total URL</strong> :".$vCnt['cnt'];
 
        ?>
 
        <table width='80%'  cellspacing='2' cellpadding='2' border="1">
 
          <tr>
 
            <td><strong>URL</strong></td>
 
            <td><strong>Type</strong></td>
 
          </tr>
 
        <?
 
        while($aRec = mysql_fetch_assoc($rsURLList)) { ?>            
 
             <tr>
 
            <td><?php echo $aRec['url'] ?></td>
 
            <td><?php echo htmlspecialchars($aRec['type'] )?></td>
 
 
          </tr>
 
         <? }  ?> 
 
         </table>
 
        
 
        <? 
 
        
 
 
    
 
    }
 
    
 
    function StoreUniqueURL($pmDomain, $pmURL, $pmParentId=0, $pmLevel,$pmType){
 
    
 
        mysql_connect($this->mDBHost,$this->mDBUserName,$this->mDBPassword);
 
        mysql_select_db($this->mDBDatabase);           
 
    
 
        $pmURL = mysql_real_escape_string($pmURL); 
 
        if($this->isURLExists($pmDomain,$pmURL)==0) {      
 
            $vURLSQL = " INSERT INTO `spider` ( domain, `url` , `parentid` , `visitflag` , `type` , `level` )
 
                            VALUES ('$pmDomain' , '$pmURL', '$pmParentId', '0', '$pmType', '$pmLevel' )";                
 
            mysql_query($vURLSQL);     
 
        }
 
        
 
    }
 
 
 
    function fetchLinkfromTag($pmData, $pmTagName, $pmAtributeName){ 
 
    
 
        $vPos = strpos($pmData, $pmTagName);
 
        if($vPos === false){
 
            return false; // if no link found stop search 
 
        }
 
        $vPos += strlen($vStr);     
 
        $vSubStr = substr($pmData,$vPos);
 
        
 
        $vHrefPos = strpos($vSubStr, $pmAtributeName);
 
        $vSubStr = substr($vSubStr,  $vHrefPos);
 
    
 
        
 
        $url = explode('"',$vSubStr);    
 
        return array("url"=>$url[1],"str"=>$vSubStr);
 
        
 
    }
 
 
 
 
 
    function fetchURLPageData($vURL) { 
 
 
        $rCurlRes = curl_init();
 
        curl_setopt($rCurlRes, CURLOPT_URL,$vURL);        
 
        curl_setopt($rCurlRes, CURLOPT_USERAGENT, 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.13)');
 
        curl_setopt($rCurlRes, CURLOPT_REFERER, $this->mDomain);
 
        curl_setopt($rCurlRes, CURLOPT_AUTOREFERER, true);       
 
        curl_setopt($rCurlRes, CURLOPT_HEADER, 0); // set to 0 to eliminate header info from response
 
        curl_setopt($rCurlRes, CURLOPT_RETURNTRANSFER, 1); // Returns response data instead of TRUE(1    
 
        $res = curl_exec($rCurlRes);
 
        return $res;
 
    }
 
    
 
   function ProcessSpiderInit() { 
 
         
 
    $this->StoreUniqueURL($this->mDomain, $this->mURL, 0,1,'index');  
 
    
 
   }
 
     
 
 
   function processSpecificTagbyType($pmData, $pmTagName, $pmAttribute) {
 
      
 
    do {    
 
            $aResult = $this->fetchLinkfromTag($pmData,$pmTagName, $pmAttribute);             
 
            
 
            $vURL     = $aResult['url'];
 
            $pmData = $aResult['str'];             
 
            if($pmData) {
 
                $this->StoreUniqueURL($this->mDomain, $vURL , 1,  1, $pmTagName);  
 
            }
 
            $vIndex++;                
 
            
 
     }while($pmData);     
 
   
 
   }   
 
 
 
   
 
   function processTagInPageData($pmData) {  
 
   
 
          $this->processSpecificTagbyType($pmData,'<a',"href=");  
 
       $this->processSpecificTagbyType($pmData,'<frame', "src=");       
 
   
 
   }
 
   
 
   
 
   function fetchURLDataandParseURL() { 
 
 
      $vData   = $this->fetchURLPageData($this->mURL);      
 
         $this->processTagInPageData($vData);        
 
  }        
 
 
}
 
 
?>
 
 |