Grab a value from HTML fetched with cURL

Asked

Viewed 3,860 times

4

I’m trying to get a value that is inside the html of a page

<a href="https://www.site.com/user.asp?ref=fvFCF9D8N4Ak">

I want to take only the value ref=fvFCF9D8N4Ak

I am using cURL and PHP, and I thought the solution would be a regex.

But how should I extract this value?

2 answers

3

Assuming you’re using cURL to fetch the HTML from an address and then PHP to extract certain data from that HTML:

You can use the DOMDocument class to parse the HTML, find the <a> tags, and read the value of each tag's href attribute.

Then the parse_url() function can extract the query string from that URL, which is what you want:

Example

// The HTML you collected (e.g. the body returned by cURL)
$html = '<html>
<head></head>
<body>
<a href="https://www.site.com/user.asp?ref=fvFCF9D8N4Ak">bubu</a>
</body>
</html>';

// Instantiate the DOMDocument
$dom = new DOMDocument;

// Load the collected HTML into the DOMDocument. Real-world markup is often
// invalid, so collect libxml's parse warnings internally instead of hiding
// them with "@", then discard them and restore the previous setting.
$previousLibxmlSetting = libxml_use_internal_errors(true);
$dom->loadHTML($html);
libxml_clear_errors();
libxml_use_internal_errors($previousLibxmlSetting);

// Stays null when no link carrying a query string is found
$queryString = null;

// Walk the DOM and inspect every 'a' tag found
foreach ($dom->getElementsByTagName('a') as $tag) {

    // Read the 'href' attribute and keep only its query string.
    // parse_url() yields null when the URL has no query part and false
    // when the URL is seriously malformed.
    $candidate = parse_url($tag->getAttribute('href'), PHP_URL_QUERY);

    // Keep the first link that actually has a query string, so that a later
    // link without one cannot overwrite the value that was found
    if (is_string($candidate) && $candidate !== '') {
        $queryString = $candidate;  // Result: ref=fvFCF9D8N4Ak
        break;
    }
}

See example working on Ideone.


If you only have the HTML present in the question, the method is exactly the same:

// Only the HTML fragment shown in the question
$html = '<a href="https://www.site.com/user.asp?ref=fvFCF9D8N4Ak">';

$dom = new DOMDocument;

// Collect libxml's parse warnings internally instead of hiding them with "@",
// then discard them and restore the previous setting.
$previousLibxmlSetting = libxml_use_internal_errors(true);
$dom->loadHTML($html);
libxml_clear_errors();
libxml_use_internal_errors($previousLibxmlSetting);

// Stays null when no link carrying a query string is found
$queryString = null;

foreach ($dom->getElementsByTagName('a') as $tag) {

    // parse_url() yields null when the URL has no query part and false
    // when the URL is seriously malformed.
    $candidate = parse_url($tag->getAttribute('href'), PHP_URL_QUERY);

    // Keep the first link that actually has a query string
    if (is_string($candidate) && $candidate !== '') {
        $queryString = $candidate; // Result: ref=fvFCF9D8N4Ak
        break;
    }
}

See example working on Ideone.

0

Hello, friend. You can try this regex-based solution (I wrote a simple class to test it):

<?php

/**
 * A simple crawler
 * By Rodrigo Nascimento
 *
 * Script-wide runtime configuration follows; the crawler class and the
 * example that uses it come after it.
 */
// A limit of 0 means the script has no maximum execution time.
set_time_limit(0);
// Report every PHP error, warning and notice.
error_reporting(E_ALL);

/**
 * Minimal cURL-based page fetcher with a fluent interface.
 *
 * Usage: choose the target with setUrl(), perform the request with
 * navigate(), then read the raw response with getPageContent().
 */
class SimpleCrawler {

    /** @var string|null Target URL of the next request; null until setUrl() is called. */
    private $url;

    /** @var string Value sent as the User-Agent header. */
    private $userAgent;

    /**
     * Declared explicitly: assigning to an undeclared property creates a
     * dynamic property, which is deprecated as of PHP 8.2.
     *
     * @var string Path of the file cookies are read from and written to.
     */
    private $chocolateCookie;

    /** @var string|false|null Result of the last curl_exec() call; null before any request. */
    private $httpResponse;

    /**
     * Sets the default User-Agent and cookie file name.
     */
    public function __construct() {
        $this->userAgent       = "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:33.0) Gecko/20100101 Firefox/33.0";
        $this->chocolateCookie = "chocolateCookies.txt";
    }

    /**
     * Sets the target URL.
     *
     * @param string $url
     * @return SimpleCrawler The same instance, for method chaining.
     */
    public function setUrl($url) {
        $this->url = $url;
        return $this;
    }

    /**
     * Performs a GET request against the configured URL and stores the result.
     *
     * The stored value is whatever curl_exec() returns: with
     * CURLOPT_RETURNTRANSFER enabled that is the response body on success
     * and false on failure.
     *
     * @return SimpleCrawler The same instance, for method chaining.
     */
    private function get() {
        $ch = curl_init();
        curl_setopt($ch, CURLOPT_URL, $this->url);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
        curl_setopt($ch, CURLOPT_USERAGENT, $this->userAgent);
        // COOKIEFILE is where cookies are read from; COOKIEJAR is where they
        // are saved. The original set COOKIEFILE twice, so cookies received
        // from the server were never persisted.
        curl_setopt($ch, CURLOPT_COOKIEFILE, $this->chocolateCookie);
        curl_setopt($ch, CURLOPT_COOKIEJAR, $this->chocolateCookie);
        $this->httpResponse = curl_exec($ch);
        return $this;
    }

    /**
     * Returns the content of the last request.
     *
     * @return string|false|null Response body, false if the request failed,
     *                           or null if navigate() has not been called yet.
     */
    public function getPageContent() {
        return $this->httpResponse;
    }

    /**
     * Navigates to the page specified through setUrl().
     *
     * @return SimpleCrawler The same instance, for method chaining.
     */
    public function navigate() {
        $this->get();

        return $this;
    }
}

/* Instance of our object, which is driven through the following methods:
 *
 * Set a url: $simpleCrawler->setUrl('site');
 * Navigate to that url: $simpleCrawler->navigate();
 * And finally access the content of the request: $simpleCrawler->getPageContent();
 *
 */

$simpleCrawler = new SimpleCrawler;
$simpleCrawler->setUrl("http://siteQualquer")
              ->navigate();

$conteudo = $simpleCrawler->getPageContent();

// The "?" that starts the query string must be escaped: unescaped it is a
// quantifier with nothing to repeat, the pattern fails to compile and
// preg_match() returns false for every input.
// The is_string() guard covers a failed request, where the content is not a string.
$urlResult = (is_string($conteudo) && preg_match('#\?ref=(.*?)">#', $conteudo, $match) === 1)
    ? $match[1]
    : "Não foi possível obter a url solicitada via Regex.";

echo $urlResult . PHP_EOL;

Browse other questions tagged

You are not signed in. Login or sign up in order to post.