Crawler for WooCommerce

Asked

Viewed 87 times

0

Good afternoon, friends.

I’m developing a PHP crawler that will scrape a list of URLs that I provide.

I’m trying to get it to pull the values from a dynamically built (paginated) URL, but I can’t get it to work.

Could someone help me?

<?php
// Title printed inside the <title> element of the generated page.
$page_title = 'MiniCrawler';
?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
    "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
<head>
    <meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>
    <title><?php print($page_title) ?></title>
</head>
<body>

<?php  
// Error handling: show every error while the crawler is being developed.
// The directive name is "display_errors" (with an underscore). The key
// "display errors" is not a directive, so ini_set() returned false and the
// setting was never applied.
ini_set('display_errors', '1');
// E_ALL has included E_STRICT since PHP 5.4, and E_STRICT itself is deprecated
// in recent PHP versions, so E_ALL alone reports the same set of errors.
error_reporting(E_ALL);


include_once ('simple_html_dom.php');

/**
 * Normalises a text fragment so it can be embedded in XML/HTML markup.
 *
 * Entities already present in the input are decoded first and the result is
 * then escaped again, so a value that was already escaped is not escaped twice.
 *
 * @param string $texto Raw text, possibly containing HTML entities.
 * @return string The text with &, <, >, " and ' escaped as entities.
 */
function limpaXml($texto){
    $decodificado = html_entity_decode($texto, ENT_QUOTES, 'UTF-8');
    $escapado = htmlspecialchars($decodificado, ENT_QUOTES, 'UTF-8');

    return $escapado;
}


/**
 * Downloads a page with cURL and parses it with simple_html_dom.
 *
 * A simple_html_dom instance is always returned so callers can call find()
 * and clear() unconditionally. When the download fails, the failure is logged
 * and the object is loaded from an empty string.
 *
 * @param string $href URL to fetch; it is also sent as the Referer header.
 * @return simple_html_dom Parsed document (built from '' on failure).
 */
function dlPage($href) {

    $str = '';

    $curl = curl_init();
    if ($curl === false) {
        error_log('dlPage: could not initialise cURL for ' . $href);
    } else {
        // NOTE(review): certificate verification is disabled, as in the
        // original code. This permits man-in-the-middle tampering on HTTPS
        // targets; enable it unless a target site really needs it off.
        curl_setopt($curl, CURLOPT_SSL_VERIFYPEER, false);
        curl_setopt($curl, CURLOPT_HEADER, false);
        curl_setopt($curl, CURLOPT_FOLLOWLOCATION, true);
        curl_setopt($curl, CURLOPT_URL, $href);
        curl_setopt($curl, CURLOPT_REFERER, $href);
        curl_setopt($curl, CURLOPT_RETURNTRANSFER, true);
        // Without timeouts a stalled server would block the crawler forever.
        curl_setopt($curl, CURLOPT_CONNECTTIMEOUT, 10);
        curl_setopt($curl, CURLOPT_TIMEOUT, 30);
        curl_setopt($curl, CURLOPT_USERAGENT, "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/533.4 (KHTML, like Gecko) Chrome/5.0.375.125 Safari/533.4");

        $resposta = curl_exec($curl);
        if ($resposta === false) {
            // curl_exec() returns false on failure; do not pass that to the parser.
            error_log('dlPage: download failed for ' . $href . ': ' . curl_error($curl));
        } else {
            $str = $resposta;
        }
        curl_close($curl);
    }

    // Create a DOM object and load the downloaded HTML (or '' on failure).
    $dom = new simple_html_dom();
    $dom->load($str);

    return $dom;
    }

/**
 * Returns the escaped, trimmed text of the node matched by $selector inside
 * $element, or '' when the selector matches nothing.
 *
 * @param object $element  Node returned by simple_html_dom's find().
 * @param string $selector Selector evaluated relative to $element.
 * @param int    $idx      Index of the match to use (-1 is passed for the sale price).
 * @return string Text passed through limpaXml(), so it is safe for addChild() and HTML output.
 */
function textoDoElemento($element, $selector, $idx = 0) {
    $no = $element->find($selector, $idx);
    if (!$no) {
        return '';
    }
    return limpaXml(trim($no->plaintext));
}

// Crawl settings, one entry per listing to scrape:
//   url       - first page of the listing
//   categoria - label stored with each product; also used as the output file name
//   selector  - selector matching one product card on the listing page
$sitesToCheck = array(
    array(
        "url" => "http://www.adidas.com.br/homem-outlet?sz=48",
        "categoria" => "adidas",
        "selector" => "div#product-grid div.product-tile div.hockeycard div.innercard",
    ),
);
$savePath = "json/";
$emailContent = "";
$xml = new SimpleXMLElement('<xml/>');

// file_put_contents() does not create missing directories.
if (!is_dir($savePath) && !mkdir($savePath, 0777, true) && !is_dir($savePath)) {
    error_log('crawler: could not create output directory ' . $savePath);
}

// For every listing to check...
foreach ($sitesToCheck as $site) {

    $url = $site["url"];
    if (empty($url)) {
        continue;
    }

    // Hard-coded for now: number of products in the listing and page size.
    $totalProdutos = 432;
    $porPag = 48;
    $numPaginas = (int) ceil($totalProdutos / $porPag);

    // Output file name derived from the category.
    $fileName = strtolower($site["categoria"]);
    // Append the paging parameter correctly whether or not the URL already has a query string.
    $separador = (strpos($url, '?') === false) ? '?' : '&';

    // The previous loop (for ($i=24; $i>$pag;) with "$i = $i") never changed
    // $i, so it ran forever and always requested the same "start" value.
    // Walk the pages explicitly instead.
    // NOTE(review): assumes "start" is a zero-based product offset
    // (0, 48, 96, ...) - confirm against the target site.
    for ($pagina = 0; $pagina < $numPaginas; $pagina++) {

        $inicio = $pagina * $porPag;
        $novaUrl = $url . $separador . "start=" . $inicio;
        // addChild() does not escape '&', and the page URL always contains
        // one, so every value is escaped before it is stored.
        $urlEscapada = limpaXml($novaUrl);
        $marca = limpaXml($site["categoria"]);

        $novoSite = $xml->addChild('website');
        $novoSite->addChild('webUrl', $urlEscapada);
        $novoSite->addChild('categoria', $marca);
        $produtos = $novoSite->addChild('produtos');

        // dlPage() always returns a DOM object, so no emptiness check is needed.
        $html = dlPage($novaUrl);

        // Find the product cards by querying with the configured selector.
        foreach ($html->find($site["selector"]) as $element) {
            $info = $element->find('div.product-info-wrapper', 0);
            $link = $element->find('div.image a', 0);
            $img = $element->find('div.image a img.show', 0);

            // Skip cards that lack the info wrapper, the link or the image,
            // instead of dereferencing a null match.
            if (!$info || empty($info->plaintext) || !$link || !$img) {
                continue;
            }

            // Built fresh for each card so no value leaks from the previous one.
            $produto = array(
                'marca'     => $marca,
                'titulo'    => textoDoElemento($element, 'div.clearfix a span.title', 0),
                'preco_old' => textoDoElemento($element, 'div.clearfix div.price span.strike', 0),
                'preco'     => textoDoElemento($element, 'div.clearfix div.price span.salesprice', -1),
                'url'       => limpaXml((string) $link->href),
                'imagem'    => limpaXml((string) $img->getAttribute('data-original')),
            );

            $produtoNovo = $produtos->addChild('produto');
            $produtoNovo->addChild('titulo', $produto['titulo']);
            $produtoNovo->addChild('marca', $produto['marca']);
            $produtoNovo->addChild('preco', $produto['preco']);
            $produtoNovo->addChild('preco_old', $produto['preco_old']);
            $produtoNovo->addChild('url', $produto['url']);
            $produtoNovo->addChild('imagem', $produto['imagem']);

            // Preview card. "width=300px" was not valid CSS; it is now "width:300px".
            echo  '<div style="float:left; width:300px; border:1px solid #000; padding:10px;"><a href="'.$produto['url'].'"><img src="'.$produto['imagem'].'"></a><br><p>Nome do Produto: '.$produto['titulo'].'</p><p>Marca: '.$produto['marca'].'</p><p>Preço Antigo: '.$produto['preco_old'].'</p><p>Preço: '.$produto['preco'].'</p><p>URL: '.$urlEscapada.'</p></div>';
        }

        // Save after every page so partial results survive a later failure.
        $arquivo = $xml->asXML();
        if (file_put_contents($savePath.$fileName.'.xml', $arquivo) === false) {
            error_log('crawler: could not write ' . $savePath . $fileName . '.xml');
        }

        // Release the parsed document before fetching the next page.
        $html->clear();
        unset($html);
    }

}



// Retrieve the DOM from a given URL


/*
$html = file_get_html('http://www.dafiti.com.br/Sandalia-Anabela-DAFITI-SHOES-Caramelo-1746512.html');

// Find all "A" tags and print their HREFs
/*foreach($html->find('a') as $e) {
    echo $e->href . '<br>';
}*/

// Retrieve all images and print their SRCs
/*foreach($html->find('a.gallery-thumb[data-img-zoom]') as $e)
    echo '<img src="'.$e . '"><br>';

// Find all anchors and images 


// Find all images, print their text with the "<>" included
/*foreach($html->find('img') as $e)
    echo $e->outertext . '<br>';
*/
// Find the DIV tag with an id of "myId"
//foreach($html->find('div#myId') as $e)
  //  echo $e->innertext . '<br>';

// Find all SPAN tags that have a class of "myClass"
/*foreach($html->find('div.detail-row') as $e)
    echo $e->innertext . '<br>';

// Find all TD tags with "align=center"
foreach($html->find('td[align=center]') as $e)
    echo $e->innertext . '<br>';



/*
$target_url = "http://www.eleshop.com.br/";
$html = new simple_html_dom();
$html->load_file($target_url);


foreach($html->find('img') as $link){
echo '<img src="'. $link->src.'"><br />';
}
*/
?>

</body>
</html>
  • Well, you didn’t say why it isn’t working. There are several points where it could be failing, such as accessing the URL or reading the content. At first glance, though, I would say that doing something like this is complex, laborious, and possibly illegal depending on what is being collected; and if the site owner changes the markup you are scraping, or blocks your IP, the crawler simply stops working. So that I’m not leaving you without an alternative: have you ever thought of using a tool suited to this kind of task, such as Import.io?

  • Diego. Thank you.

No answers

Browse other questions tagged

You are not signed in. Login or sign up in order to post.