How to scrape Qlikview tables using Nodejs?

Asked

Viewed 172 times

2

This website of the Brazilian government publishes salary data for judges of various courts and tribunals. I would like to download all the tables, but the table data is not in the HTML I receive as a response when I use request.

To get around this problem, I used puppeteer and cheerio to open a browser, wait for the table to load and then use a Jquery selector and extract the data. This is my code:

const puppeteer = require("puppeteer");
const cheerio = require("cheerio");


/**
 * Opens the QlikView panel in a visible browser, waits a fixed delay for the
 * page to render, then prints the concatenated text of every element matching
 * the `.injected` selector.
 *
 * The browser is always closed before the returned promise settles, even when
 * navigation or scraping throws, so no browser process is left running.
 *
 * @returns {Promise<void>} Resolves once the text has been logged and the
 *   browser has been closed; rejects with the original error otherwise.
 */
const main = async () => {
    const browser = await puppeteer.launch({ headless: false});
    try {
        const page = await browser.newPage();
        await page.goto("https://paineis.cnj.jus.br/QvAJAXZfc/opendoc.htm?document=qvw_l%2FPainelCNJ.qvw&host=QVS%40neodimio03&anonymous=true&sheet=shPORT63Relatorios");
        // Fixed 10-second pause before the HTML snapshot is taken.
        await sleep(10*1000);
        const html = await page.content();
        const $ = cheerio.load(html);
        console.log($(".injected").text());
    } finally {
        // Release the browser on both the success and the failure path.
        await browser.close();
    }
};

/**
 * Returns a promise that resolves (with no value) after the given delay.
 *
 * @param {number} miliseconds - Delay, in milliseconds, passed to `setTimeout`.
 * @returns {Promise<void>} Promise fulfilled once the timer fires.
 */
function sleep(miliseconds) {
    return new Promise((done) => {
        setTimeout(done, miliseconds);
    });
}

// Run the scraper; report any failure and exit with a non-zero status
// instead of leaving the promise rejection unhandled.
main().catch((err) => {
    console.error(err);
    process.exitCode = 1;
});

The problem is that the table I receive in the response is incomplete: it has only a few rows, and some of their cells are missing:

P63_CE_TRIBUNALCNJTribunalMagistradoMês/Ano Ref.CNJADHAILTON LACET CORREIA PORTO12/2018ADRIANA FRANCO MELO MACHADO02/202103/202104/2021ADRIANA LINS DE OLIVEIRA BEZERRA12/2018ADRIANO DA SILVA ARAUJO08/201909/201910/201911/201912/201901/202002/202003/202004/202005/202006/202007/202008/202009/202010/202011/202012/202001/202102/202103/202104/2021ALESSANDRA VARANDAS PAIVA MA...12/2018ALEXANDRE CHINI NETO09/201810/2018Subsídio (R$)Direitos Pessoais (1)Indenizações (2)Direitos Eventuais (3)Total de Rendimentos (4)Previdência Pública (5) (R$)Imposto de Renda (6) (R$)Descontos Diversos (7) (R$)Retenção por Teto Constitucional (8) (R$)Total de Descontos (9)Rendimento Líquido (10)Remuneração do órgão de origem (11) (R$)Diárias (12) (R$)0,000,000,00463,16463,160,000,000,000,000,00463,160,000,001.698,450,000,000,001.698,450,000,000,000,000,001.698,4533.689,110,003.639,540,0067.378,220,0071.017,760,00191,130,000,00191,1370.826,6333.689,110,003.639,540,000,000,003.639,540,00191,130,000,00191,133.448,4133.689,110,000,000,000,004.631,614.631,610,001.272,050,000,001.272,053.359,560,000,003.371,830,000,000,003.371,830,00150,970,000,00150,973.220,8632.004,710,005.323,940,000,000,005.323,940,00594,720,000,00594,724.729,2232.004,719.100,005.323,940,000,000,005.323,940,00594,720,000,00594,724.729,2232.004,717.700,005.323,940,000,000,005.323,940,00594,720,000,00594,724.729,2232.004,711.750,005.323,940,000,002.218,317.542,250,00618,290,000,00618,296.923,9632.004,719.100,005.323,940,000,002.661,977.985,910,00594,720,000,00594,727.391,1932.004,715.600,005.323,940,000,000,005.323,940,00594,720,000,00594,724.729,2232.004,714.550,005.323,940,000,000,005.323,940,00594,720,000,00594,724.729,2232.004,714.550,005.323,940,0032.004,710,0037.328,650,00594,720,000,00594,7236.733,9332.004,714.550,005.323,940,000,000,005.323,940,00594,720,000,00594,724.729,2232.004,710,005.323,940,004.158,850,009.482,790,00594,720,000,00594,728.888,0732.004,710,005.323,940,004.158,850,009.482,790,00673,560,000,00673,568.8
09,2332.004,710,005.323,940,004.158,85286,699.769,480,00673,560,000,00673,569.095,9232.004,710,005.323,940,000,000,005.323,940,00594,720,000,00594,724.729,2232.004,719.100,005.323,940,000,000,005.323,940,00594,720,000,00594,724.729,2232.004,714.550,005.323,940,000,000,005.323,940,00594,720,000,00594,724.729,2232.004,714.550,005.323,940,000,004.436,629.760,560,001.189,440,000,001.189,448.571,1232.004,714.550,005.323,940,000,002.661,977.985,910,00594,720,000,00594,727.391,1932.004,714.550,005.323,940,000,000,005.323,940,00594,720,000,00594,724.729,2232.004,714.550,005.323,940,000,000,005.323,940,00594,720,000,00594,724.729,2232.004,714.550,005.323,940,000,000,005.323,940,00594,720,000,00594,724.729,2232.004,714.550,000,000,000,004.631,614.631,610,001.272,050,000,001.272,053.359,560,000,003.127,300,000,000,003.127,300,00161,200,000,00161,202.966,1028.947,550,003.127,300,000,000,003.127,300,00114,300,000,00114,303.013,0028.947,5511.900,00

I tried several variations of the Jquery selector unsuccessfully.

I read that I could communicate with Qlikview using enigmajs and then make my request. However, not even the most basic example of documentation worked correctly on the site I’m using.

How do I scrape data from a Qlikview table?

  • 2

    who negatively suggested a change in the question?

2 answers

3


You can click the Imprimir (Print) button using the selector div[title=Imprimir]. A new tab containing the data will then open; select that tab and use page.evaluate to extract the data you want from the page. In the example below I simply read each row as-is, but because some values span more than one row you will need to elaborate this part a little more:

const puppeteer = require('puppeteer');

// Returns a promise that is fulfilled (with no value) after `ms` milliseconds.
const aguardar = (ms) => new Promise((concluir) => {
  setTimeout(concluir, ms);
});

/**
 * Collects the text of the report table as an array of rows, each row being
 * an array of cell strings.
 *
 * Reads the global `document`, so it is meant to be executed inside the page.
 * Rows are the `tr` elements of the first table's `tbody`, excluding the
 * first two; within each row the first `td` is excluded.
 *
 * @returns {string[][]} One inner array of `innerText` values per matched row.
 */
const obterDados = () => {
  const seletorLinhas = 'table:nth-of-type(1) > tbody > tr:not(:nth-of-type(1)):not(:nth-of-type(2))';
  const seletorColunas = 'td:not(:nth-of-type(1))';

  const tabela = [];
  for (const linha of document.querySelectorAll(seletorLinhas)) {
    const textos = [];
    for (const coluna of linha.querySelectorAll(seletorColunas)) {
      textos.push(coluna.innerText);
    }
    tabela.push(textos);
  }
  return tabela;
};

/**
 * Opens the QlikView panel, clicks the "Imprimir" button, waits for the
 * report tab that the click opens, and logs the table rows extracted from
 * that tab by `obterDados`.
 *
 * The report tab is identified as the tab that did not exist before the
 * click, rather than by a fixed position in the tab list. The browser is
 * always closed before the returned promise settles.
 *
 * @returns {Promise<void>} Resolves after the rows have been logged.
 * @throws {Error} If no new tab was opened by the click, or if any browser
 *   operation fails.
 */
const principal = async () => {
  const navegador = await puppeteer.launch({ headless: false });
  try {
    const pagina = await navegador.newPage();
    await pagina.goto('https://paineis.cnj.jus.br/QvAJAXZfc/opendoc.htm?document=qvw_l%2FPainelCNJ.qvw&host=QVS%40neodimio03&anonymous=true&sheet=shPORT63Relatorios');
    await aguardar(5 * 1000);
    await pagina.waitForSelector('div[title=Imprimir]', { timeout: 0 });

    // Remember which tabs exist now so the one opened by the click can be
    // told apart afterwards.
    const paginasAntes = new Set(await navegador.pages());

    await pagina.click('div[title=Imprimir]');
    await aguardar(5 * 1000);

    // Continue only once the modal dialog is hidden AND at least 10 more
    // seconds have elapsed.
    await Promise.all([
      pagina.waitForSelector('div.ModalDialog', { hidden: true, timeout: 0 }),
      aguardar(10 * 1000),
    ]);

    const paginas = await navegador.pages();
    const paginaRelatorio = paginas.find((candidata) => !paginasAntes.has(candidata));
    if (!paginaRelatorio) {
      throw new Error('Report tab was not opened after clicking "Imprimir"');
    }

    await paginaRelatorio.bringToFront();
    await paginaRelatorio.waitForSelector('table.currentsel', { visible: true, timeout: 0 });

    const dados = await paginaRelatorio.evaluate(obterDados);
    console.log(dados);
  } finally {
    // Release the browser on both the success and the failure path.
    await navegador.close();
  }
};

// Run the scraper; report any failure and exit with a non-zero status
// instead of leaving the promise rejection unhandled.
principal().catch((erro) => {
  console.error(erro);
  process.exitCode = 1;
});

Which will result in something similar to:

[
  [
    'ADHAILTON LACET CORREIA PORTO',
    '12/2018',
    '0,00',
    '0,00',
    '0,00',
    '463,16',
    '463,16',
    '0,00',
    '0,00',
    '0,00',
    '0,00',
    '0,00',
    '463,16',
    '0,00',
    '0,00'
  ],
  [
    '02/2021',   '1.698,45',
    '0,00',      '0,00',
    '0,00',      '1.698,45',
    '0,00',      '0,00',
    '0,00',      '0,00',
    '0,00',      '1.698,45',
    '33.689,11', '0,00'
  ],
  [
    '3.639,54',  '0,00',
    '67.378,22', '0,00',
    '71.017,76', '0,00',
    '191,13',    '0,00',
    '0,00',      '191,13',
    '70.826,63', '33.689,11',
    '0,00'
  ],
  [
    '3.639,54', '0,00',
    '0,00',     '0,00',
    '3.639,54', '0,00',
    '191,13',   '0,00',
    '0,00',     '191,13',
    '3.448,41', '33.689,11',
    '0,00'
  ],
  [
    '3.639,54', '0,00',
    '0,00',     '1.516,48',
    '5.156,02', '0,00',
    '191,13',   '0,00',
    '0,00',     '191,13',
    '4.964,89', '33.689,11',
    '0,00'
  ],
  ... XXXX more items
]

Note: you don’t need cheerio to search for the data on the page.

0

On the site you linked, I saw that there is a Print icon; clicking it generates a link to an HTML page.

I believe you can parse everything you need from it: https://paineis.cnj.jus.br/QvAjaxZfc/QvsViewClient.aspx?public=only&size=long&host=QVS%40neodimio03&name=Temp/1ac864e398f6455cb03e8bfd1eb19cc2.html

By the name of the parameter: name=Temp/ I don’t know if it’s really something temporary, so I suggest you create an API and not consume directly from their site.

Your API would fetch the HTML file, cache it, and make it available to your application; from time to time you would refresh the cache with a newly generated print HTML.

good luck :)

Browse other questions tagged

You are not signed in. Login or sign up in order to post.