How to extract information from an HTML document with Jsoup?

Asked

Viewed 4,116 times

2

I’ve been studying how to extract data with Jsoup, and I took an example from this link: Jsoup

I tried to modify the example to extract the data from a div instead of a meta attribute, but I couldn’t get it to work.

I want to load the posts that people publish on a particular blog and display them on a page in the app.

Can someone help me modify this code to get the div data?

package com.androidbegin.jsouptutorial;

import java.io.IOException;
import java.io.InputStream;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;

import android.os.AsyncTask;
import android.os.Bundle;
import android.app.Activity;
import android.app.ProgressDialog;
import android.graphics.Bitmap;
import android.graphics.BitmapFactory;
import android.view.View;
import android.view.View.OnClickListener;
import android.widget.Button;
import android.widget.ImageView;
import android.widget.TextView;

/**
 * Demo activity that scrapes a web page with Jsoup.
 *
 * Three buttons each start an AsyncTask that downloads {@link #url} on a
 * background thread and then shows one piece of the page (title, meta
 * description, or logo image) in the layout.
 */
public class MainActivity extends Activity {

    // URL Address
    String url = "http://www.androidbegin.com";
    // Dialog shown while a task is running; replaced each time a task starts.
    ProgressDialog mProgressDialog;

    @Override
    public void onCreate(Bundle savedInstanceState) {
        super.onCreate(savedInstanceState);
        setContentView(R.layout.activity_main);

        // Locate the Buttons in activity_main.xml
        Button titlebutton = (Button) findViewById(R.id.titlebutton);
        Button descbutton = (Button) findViewById(R.id.descbutton);
        Button logobutton = (Button) findViewById(R.id.logobutton);

        // Capture button click
        titlebutton.setOnClickListener(new OnClickListener() {
            public void onClick(View arg0) {
                // Execute Title AsyncTask
                new Title().execute();
            }
        });

        // Capture button click
        descbutton.setOnClickListener(new OnClickListener() {
            public void onClick(View arg0) {
                // Execute Description AsyncTask
                new Description().execute();
            }
        });

        // Capture button click
        logobutton.setOnClickListener(new OnClickListener() {
            public void onClick(View arg0) {
                // Execute Logo AsyncTask
                new Logo().execute();
            }
        });

    }

    /** Creates and shows the "Loading..." dialog used by every task. */
    private void showProgressDialog() {
        mProgressDialog = new ProgressDialog(MainActivity.this);
        mProgressDialog.setTitle("Android Basic JSoup Tutorial");
        mProgressDialog.setMessage("Loading...");
        mProgressDialog.setIndeterminate(false);
        mProgressDialog.show();
    }

    /** Dismisses the loading dialog if one has been created. */
    private void dismissProgressDialog() {
        if (mProgressDialog != null) {
            mProgressDialog.dismiss();
        }
    }

    /** Downloads the page and shows its HTML title in the title TextView. */
    private class Title extends AsyncTask<Void, Void, Void> {
        // Stays null when the download fails.
        String title;

        @Override
        protected void onPreExecute() {
            super.onPreExecute();
            showProgressDialog();
        }

        @Override
        protected Void doInBackground(Void... params) {
            try {
                // Connect to the web site
                Document document = Jsoup.connect(url).get();
                // Get the html document title
                title = document.title();
            } catch (IOException e) {
                e.printStackTrace();
            }
            return null;
        }

        @Override
        protected void onPostExecute(Void result) {
            // Set title into TextView
            TextView txttitle = (TextView) findViewById(R.id.titletxt);
            txttitle.setText(title);
            dismissProgressDialog();
        }
    }

    /** Downloads the page and shows its meta description in the description TextView. */
    private class Description extends AsyncTask<Void, Void, Void> {
        // Stays null when the download fails.
        String desc;

        @Override
        protected void onPreExecute() {
            super.onPreExecute();
            showProgressDialog();
        }

        @Override
        protected Void doInBackground(Void... params) {
            try {
                // Connect to the web site
                Document document = Jsoup.connect(url).get();
                // Using Elements to get the Meta data
                Elements description = document
                        .select("meta[name=description]");
                // Locate the content attribute
                desc = description.attr("content");
            } catch (IOException e) {
                e.printStackTrace();
            }
            return null;
        }

        @Override
        protected void onPostExecute(Void result) {
            // Set description into TextView
            TextView txtdesc = (TextView) findViewById(R.id.desctxt);
            txtdesc.setText(desc);
            dismissProgressDialog();
        }
    }

    /** Downloads the page, fetches the logo image it references and shows it in the ImageView. */
    private class Logo extends AsyncTask<Void, Void, Void> {
        // Stays null when the page or the image could not be downloaded.
        Bitmap bitmap;

        @Override
        protected void onPreExecute() {
            super.onPreExecute();
            showProgressDialog();
        }

        @Override
        protected Void doInBackground(Void... params) {

            try {
                // Connect to the web site
                Document document = Jsoup.connect(url).get();
                // Using Elements to get the class data
                Elements img = document.select("a[class=brand brand-image] img[src]");
                // Locate the src attribute
                String imgSrc = img.attr("src");
                // Download image from URL
                InputStream input = new java.net.URL(imgSrc).openStream();
                try {
                    // Decode Bitmap
                    bitmap = BitmapFactory.decodeStream(input);
                } finally {
                    // Always release the network stream, even if decoding fails.
                    input.close();
                }

            } catch (IOException e) {
                e.printStackTrace();
            }
            return null;
        }

        @Override
        protected void onPostExecute(Void result) {
            // Set downloaded image into ImageView
            ImageView logoimg = (ImageView) findViewById(R.id.logo);
            logoimg.setImageBitmap(bitmap);
            dismissProgressDialog();
        }
    }
}

The structure of the site where I want to extract the data is like this:

<div class="postWrapper" id="post162">

   <div class="postTitle">
      <h2> Titulo do post </h2>

      <div class="fb-custom-share" data-url="http://url..."></div>

      <div class="date"> 26 de janeiro de 2015 </div>
   </div>
   <div class="postContent">
      Conteudo
   </div>
</div>

1 answer

2


Just select all the elements with the class postWrapper, which is where the content you are interested in lives; the method getElementsByClass does exactly that. Then iterate over the elements found (an Elements collection) and access each one's children. Since they all have classes, you can use the same method to retrieve them.

The only difference is the share link, which is stored in the data-url attribute of the fb-custom-share element: there you need to get the element by its class and then read the attribute.

element.getElementsByClass("fb-custom-share").attr("data-url");


// Download the page and collect every post container.
Document document = Jsoup.connect(URL).get();
Elements posts = document.getElementsByClass("postWrapper");

for (Element element : posts) {
    // The title is the <h2> directly inside the .postTitle block.
    Elements heading = element.select(".postTitle > h2");
    String title = heading.text();

    // The share link is kept in the data-url attribute, not in the text.
    Elements shareButton = element.getElementsByClass("fb-custom-share");
    String share = shareButton.attr("data-url");

    String date = element.getElementsByClass("date").text();
    String content = element.getElementsByClass("postContent").text();

    // Store the values in an ArrayList<ArrayList> or a Map...
}

Here is a test program:

import java.io.IOException;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * Stand-alone demo: parses an in-memory copy of the HTML structure from the
 * question and prints the title, share link, date and content of each post.
 */
public class Main {
    /**
     * Builds the sample HTML, parses it with Jsoup and prints one labelled
     * line per extracted field for every element with class "postWrapper".
     *
     * @param args unused
     * @throws IOException declared by the original signature; kept for compatibility
     */
    public static void main(String[] args) throws IOException {

        // HTML structure posted in the question.
        StringBuilder html = new StringBuilder();
        html.append("<div class=\"postWrapper\" id=\"post162\">")
                .append("<div class=\"postTitle\">")
                    .append("<h2>Título A</h2>")
                    .append("<div class=\"fb-custom-share\" data-url=\"linkA\"></div>")
                    .append("<div class=\"date\"> 26 de janeiro de 2015 </div>")
                .append("</div>")
                .append("<div class=\"postContent\">")
                    .append("Conteúdo A")
                .append("</div>")
            .append("</div>");

        // Parse the string into a document.
        Document document = Jsoup.parse(html.toString());

        Elements elements = document.getElementsByClass("postWrapper");

        for(Element element : elements){
            // Each label starts on its own line; without the "\n" separators
            // all four fields would be printed as one run-together line.
            System.out.println("Título: " + element.select(".postTitle > h2").text() +
                               "\nLink de compartilhamento: " + element.getElementsByClass("fb-custom-share").attr("data-url") +
                               "\nData: " + element.getElementsByClass("date").text() +
                               "\nConteúdo: " + element.getElementsByClass("postContent").text());
        }
    }
}

output:

Título: Título A
Link de compartilhamento: linkA
Data: 26 de janeiro de 2015
Conteúdo: Conteúdo A

Browse other questions tagged

You are not signed in. Login or sign up in order to post.