Extract text between HTML tags with Indy Idhttp with Delphi

Asked

Viewed 7,237 times

4

I have an html site that contains:

<html>
<head>
<title>Teste</title>
</head>
<body>
<h1>Teste 1</h1>
<h2>Teste 2</h2>
</body>
</html>

I am downloading the content of the site and displaying it in a TMemo with:

// Download the page and show the raw HTML in the memo.
// The client is created with a nil owner, so nothing else will free it;
// try/finally releases it even when Get raises (network error, HTTP error).
// NOTE(review): assumes IdHTTP1 is not used again after this snippet — confirm.
IdHTTP1 := TIdHTTP.Create(nil);
try
  IdHTTP1.Request.Accept := 'text/html, */*';
  IdHTTP1.Request.UserAgent := 'Mozilla/3.0 (compatible; IndyLibrary)';
  IdHTTP1.Request.ContentType := 'application/x-www-form-urlencoded';
  IdHTTP1.HandleRedirects := True;
  HTML := IdHTTP1.Get('http://www.site.com/link.html');
finally
  IdHTTP1.Free;
end;
Memo1.Text := HTML;

The problem is that I can't extract the content between the <h1> ... </h1> tags (i.e., "Teste 1") and show it in a label.

  • How are you trying to extract it?

2 answers

5

Here is a function that I use to do this with HTML and/or XML content :)

function ExtractText(aText, OpenTag, CloseTag : String) : String;
{ Returns the text found between the first occurrence of OpenTag and the
  first occurrence of CloseTag that follows it.

  aText    : the text to search (e.g. HTML or XML content)
  OpenTag  : the opening delimiter, e.g. '<h1>'
  CloseTag : the closing delimiter, e.g. '</h1>'

  Returns an empty string when OpenTag is not found, or when no CloseTag
  appears after it. The match is case-sensitive. }
var
  StartPos, EndPos : Integer;
  Remainder : String;
begin
  Result := '';

  StartPos := Pos(OpenTag, aText);
  if StartPos = 0 then
    Exit;

  // Skip past the opening tag and look for the closing tag only in what
  // follows it, so a CloseTag occurring earlier in the text is ignored.
  Inc(StartPos, Length(OpenTag));
  Remainder := Copy(aText, StartPos, MaxInt);

  EndPos := Pos(CloseTag, Remainder);
  if EndPos = 0 then
    Exit;

  Result := Copy(Remainder, 1, EndPos - 1);
end;

Parameters:

  • aText: would be XML or HTML content;
  • OpenTag: the opening tag (in your case, for example, <h1>);
  • CloseTag: the closing tag (in your case, for example, </h1>);

Then you would call the function like this, for example:

variavelString := ExtractText(Memo1.Text, '<h1>', '</h1>');

I hope I helped. Hug!

-2

First I add a line break after each closing </p> tag; then I go through the text looking for the opening < and closing > of each HTML tag and delete the tag; finally I replace the HTML character entities with the actual characters.

function TFuncoes.RetiraTagsHTML(ConteudoHTML: string): string;
{ Strips HTML tags from ConteudoHTML and decodes a fixed set of character
  entities, returning the resulting plain text.

  Steps:
    1. Append a CR+LF line break after each closing paragraph tag so that
       paragraphs stay on separate lines once the tags are gone.
    2. Delete every '<' ... '>' span. An unterminated '<' (no '>' after it)
       is left in place together with the text that follows it.
    3. Replace the entities listed in Entities with their characters.

  This is a simple textual scan, not an HTML parser: a literal '<' in the
  text is treated as the start of a tag. }
const
  // '&amp;' must stay last: decoding it earlier would turn '&amp;lt;' into
  // '&lt;' and then into '<', i.e. decode the text twice.
  Entities: array[0..31, 0..1] of string = (
    ('&aacute;', 'á'), ('&eacute;', 'é'), ('&iacute;', 'í'),
    ('&oacute;', 'ó'), ('&uacute;', 'ú'),
    ('&acirc;', 'â'), ('&ecirc;', 'ê'), ('&ocirc;', 'ô'),
    ('&agrave;', 'à'), ('&uuml;', 'ü'), ('&ccedil;', 'ç'),
    ('&atilde;', 'ã'), ('&otilde;', 'õ'), ('&ntilde;', 'ñ'),
    ('&Aacute;', 'Á'), ('&Eacute;', 'É'), ('&Iacute;', 'Í'),
    ('&Oacute;', 'Ó'), ('&Uacute;', 'Ú'),
    ('&Acirc;', 'Â'), ('&Ecirc;', 'Ê'), ('&Ocirc;', 'Ô'),
    ('&Agrave;', 'À'), ('&Ccedil;', 'Ç'),
    ('&Atilde;', 'Ã'), ('&Otilde;', 'Õ'), ('&Ntilde;', 'Ñ'),
    ('&quot;', '"'), ('&lt;', '<'), ('&gt;', '>'), ('&nbsp;', ' '),
    ('&amp;', '&')
  );
var
  TagInicio, TagLength, i: Integer;
begin
  // Insert a line break after paragraph ends (CR+LF, in that order).
  ConteudoHTML := StringReplace(ConteudoHTML, '</p>', '</p>' + #13#10, [rfReplaceAll]);
  // NOTE(review): the ' /p>' pattern is kept from the original code; it
  // presumably targets a malformed closing tag such as '< /p>' — confirm.
  ConteudoHTML := StringReplace(ConteudoHTML, ' /p>', ' /p>' + #13#10, [rfReplaceAll]);

  TagInicio := Pos('<', ConteudoHTML); // position of the first '<'
  while TagInicio > 0 do
  begin
    // Look for the matching '>' only from the '<' onwards. Because the
    // copy starts at the '<', the position of '>' inside it is exactly the
    // number of characters to delete.
    TagLength := Pos('>', Copy(ConteudoHTML, TagInicio, MaxInt));
    if TagLength = 0 then
      Break; // unterminated '<': nothing more can be removed
    Delete(ConteudoHTML, TagInicio, TagLength); // remove the tag
    TagInicio := Pos('<', ConteudoHTML);        // look for the next '<'
  end;

  // Finally, decode the HTML character entities.
  for i := Low(Entities) to High(Entities) do
    ConteudoHTML := StringReplace(ConteudoHTML, Entities[i, 0], Entities[i, 1], [rfReplaceAll]);

  Result := ConteudoHTML;
end;
  • Hi Rodrigo, how are you? Congratulations on the answer, but since many other people may read it in the future, could you describe your solution and code? It helps a lot of people, and writing well-crafted answers is one of the practices the community tries to pass on to everyone.

Browse other questions tagged

You are not signed in. Login or sign up in order to post.