How to index and search for files in Lucene

Question

How to index and search for files in Lucene

Asked 12 years ago

Viewed 2,110 times

5

I am trying to build a file indexer in Java with the help of Lucene.
I followed this guide from iMasters and tried to adapt it to Lucene 4.7.0, but the search is not working.
I inspected the generated index, and both the file information and the file contents are being indexed.
Could you help me find the problem?

My code follows:

Indexador.java

package main;

import java.io.*;
import java.text.*;

import org.apache.log4j.*;
import org.apache.lucene.analysis.*;
import org.apache.lucene.analysis.standard.*;
import org.apache.lucene.document.*;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.*;
import org.apache.lucene.store.*;
import org.apache.lucene.util.*;
import org.apache.tika.*;

/**
 * Indexes the files of a directory into an on-disk Lucene index: text is
 * extracted from each accepted file with Tika and stored, together with the
 * file path and last-modified date, as one Lucene document per file.
 */
public class Indexador {
    private static Logger logger = Logger.getLogger(Indexador.class);
    // Directory that stores the index;
    private String diretorioDosIndices = "C:\\Users\\strokes"
            + "\\Documents\\indice-lucene";
    // Directory that contains the documents to be indexed;
    private String diretorioParaIndexar = "C:\\Users\\strokes"
            + "\\Downloads";
    // IndexWriter: creates and maintains the index;
    private IndexWriter writer;
    // Library that extracts text from several well-known formats;
    private Tika tika;

    /** Entry point: builds the index using the hard-coded directories above. */
    public static void main(String[] args) {
        Indexador indexador = new Indexador();
        indexador.indexaArquivosDoDiretorio();
    }

    /**
     * Deletes the files of the previous index, opens a new IndexWriter, indexes
     * everything under {@code diretorioParaIndexar}, then commits and closes
     * the writer. An IOException is logged, not rethrown.
     */
    public void indexaArquivosDoDiretorio() {
        try {
            File diretorio = new File(diretorioDosIndices);
            apagaIndices(diretorio);
            // Directory: represents the directory of the index;
            Directory d = new SimpleFSDirectory(diretorio);
            logger.info("Diretório do índice: " + diretorioDosIndices);
            // Analyzer/StandardAnalyzer: pre-process the text.
            // Analyzers exist for other languages too, including Portuguese;
            Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_47);
            // IndexWriterConfig: settings for index creation. This project
            // uses the default values;
            IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_47,
                    analyzer);
            // Initializes the IndexWriter for writing;
            writer = new IndexWriter(d, config);
            long inicio = System.currentTimeMillis();
            indexaArquivosDoDiretorio(new File(diretorioParaIndexar));
            // Make the added documents searchable, then release the writer.
            // NOTE(review): commit/close are skipped if an exception is thrown
            // above, leaving the writer open.
            writer.commit();
            writer.close();
            long fim = System.currentTimeMillis();
            logger.info("Tempo para indexar: " + ((fim - inicio) / 1000) + "s");
        } catch (IOException e) {
            logger.error(e);
        }
    }

    /**
     * Deletes every entry directly inside the index directory, if it exists.
     * The result of each delete() call is ignored.
     * NOTE(review): listFiles() returns null when the path is not a directory,
     * which would make the loop throw a NullPointerException.
     */
    private void apagaIndices(File diretorio) {
        if (diretorio.exists()) {
            File arquivos[] = diretorio.listFiles();
            for (File arquivo : arquivos) {
                arquivo.delete();
            }
        }
    }

    /**
     * Indexes every entry of {@code raiz} accepted by the extension filter;
     * accepted entries that are not regular files are processed recursively.
     * Failures on a single file are logged and do not stop the loop.
     *
     * @param raiz directory whose entries are listed
     */
    public void indexaArquivosDoDiretorio(File raiz) {
        // Accepts names ending (case-insensitively) in a supported extension.
        // NOTE(review): the filter looks only at the name, so a subdirectory is
        // listed (and recursed into below) only if its own name ends in one of
        // these extensions.
        FilenameFilter filtro = new FilenameFilter() {
            public boolean accept(File arquivo, String nome) {
                if (nome.toLowerCase().endsWith(".pdf")
                        || nome.toLowerCase().endsWith(".odt")
                        || nome.toLowerCase().endsWith(".doc")
                        || nome.toLowerCase().endsWith(".docx")
                        || nome.toLowerCase().endsWith(".ppt")
                        || nome.toLowerCase().endsWith(".pptx")
                        || nome.toLowerCase().endsWith(".xls")
                        || nome.toLowerCase().endsWith(".txt")
                        || nome.toLowerCase().endsWith(".rtf")) {
                    return true;
                }
                return false;
            }
        };
        for (File arquivo : raiz.listFiles(filtro)) {
            if (arquivo.isFile()) {
                // Log the file being indexed and its size (bytes / 1000).
                StringBuffer msg = new StringBuffer();
                msg.append("Indexando o arquivo ");
                msg.append(arquivo.getAbsoluteFile());
                msg.append(", ");
                msg.append(arquivo.length() / 1000);
                msg.append("kb");
                logger.info(msg);
                try {
                    // Extracts the content of the file with Tika;
                    String textoExtraido = getTika().parseToString(arquivo);
                    indexaArquivo(arquivo, textoExtraido);
                } catch (Exception e) {
                    logger.error(e);
                }
            } else {
                indexaArquivosDoDiretorio(arquivo);
            }
        }
    }

    /**
     * Builds a Document with the fields UltimaModificacao (yyyyMMdd), Caminho
     * (absolute path) and Texto (extracted text) and adds it to the index.
     *
     * @param arquivo       file being indexed
     * @param textoExtraido text extracted from the file
     */
    private void indexaArquivo(File arquivo, String textoExtraido) {
        SimpleDateFormat formatador = new SimpleDateFormat("yyyyMMdd");
        String ultimaModificacao = formatador.format(arquivo.lastModified());
        // Builds a Document for indexing.
        // Field.Store.YES: stores a copy of the text in the index, which
        // greatly increases its size;
        // Field.Index.ANALYZED: used when the field is free text;
        // Field.Index.NOT_ANALYZED: used when the field is an ID, a date or
        // numeric.
        // NOTE(review): the two Field.Index notes above come from the Lucene 3
        // API. Here all three fields, including the free-text Texto, are added
        // as StringField, which indexes the whole value as a single,
        // un-analyzed token - so a parsed query on Texto cannot match
        // individual words.
        Document documento = new Document();
        documento.add(new StringField("UltimaModificacao", ultimaModificacao, Field.Store.YES));
        documento.add(new StringField("Caminho", arquivo.getAbsolutePath(), Field.Store.YES));
        documento.add(new StringField("Texto", textoExtraido, Field.Store.YES));
        try {
            // Adds the Document to the index; it only becomes available for
            // searching after the commit.
            getWriter().addDocument(documento);
        } catch (IOException e) {
            logger.error(e);
        }
    }

    /** Returns the Tika instance, creating it on first use. */
    public Tika getTika() {
        if (tika == null) {
            tika = new Tika();
        }
        return tika;
    }

    /** Returns the current IndexWriter (null until the index has been opened). */
    public IndexWriter getWriter() {
        return writer;
    }
}

Buscador.java

package main;

import java.io.*;

import javax.swing.*;

import org.apache.log4j.*;
import org.apache.lucene.analysis.*;
import org.apache.lucene.analysis.standard.*;
import org.apache.lucene.document.*;
import org.apache.lucene.index.*;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.*;
import org.apache.lucene.store.*;
import org.apache.lucene.util.*;

/**
 * Runs a user-supplied query against the Lucene index and logs the path,
 * last-modified date and score of each hit.
 */
public class Buscador {
    private static Logger logger = Logger.getLogger(Buscador.class);
    // Directory that stores the index to be searched.
    private String diretorioDoIndice = "C:\\Users\\strokes"
            + "\\Documents\\indice-lucene";

    /**
     * Parses {@code parametro} with the classic QueryParser (default field
     * "Texto"), retrieves up to 100 hits and logs them. Any exception is
     * logged, not rethrown.
     * NOTE(review): the IndexReader opened here is never closed.
     *
     * @param parametro query string in Lucene query syntax
     */
    public void buscaComParser(String parametro) {
        try {
            Directory diretorio = new SimpleFSDirectory(new File(
                    diretorioDoIndice));
            // IndexReader: abstract class responsible for accessing the index;
            IndexReader leitor = DirectoryReader.open(diretorio);
            // IndexSearcher: implements the methods needed to run searches
            // against an index;
            IndexSearcher buscador = new IndexSearcher(leitor);
            Analyzer analisador = new StandardAnalyzer(Version.LUCENE_47);
            // QueryParser/Query: represent the user's query. Other examples of
            // queries can be found in the Javadoc;
            QueryParser parser = new QueryParser(Version.LUCENE_47, "Texto",
                    analisador);
            Query consulta = parser.parse(parametro);
            long inicio = System.currentTimeMillis();
            // Runs the search and keeps the result in a TopDocs;
            TopDocs resultado = buscador.search(consulta, 100);
            long fim = System.currentTimeMillis();
            int totalDeOcorrencias = resultado.totalHits;
            logger.info("Total de documentos encontrados:" + totalDeOcorrencias);
            logger.info("Tempo total para busca: " + (fim - inicio) + "ms");
            // ScoreDoc: represents each document returned by the search.
            for (ScoreDoc sd : resultado.scoreDocs) {
                Document documento = buscador.doc(sd.doc);
                logger.info("Caminho:" + documento.get("Caminho"));
                logger.info("Última modificação:"
                        + documento.get("UltimaModificacao"));
                logger.info("Score:" + sd.score);
                logger.info("--------");
            }
        } catch (Exception e) {
            logger.error(e);
        }
    }

    /** Entry point: asks for a query in a dialog and runs the search. */
    public static void main(String[] args) {
        Buscador b = new Buscador();
        String parametro = JOptionPane.showInputDialog("Consulta");
        b.buscaComParser(parametro);
    }
}

2 answers

3

The field "Texto" is a StringField, which means it is not analyzed: its entire content is indexed as a single token. This is effectively the same as using KeywordAnalyzer in Lucene 3.6, or defining the field with Field.Index.NOT_ANALYZED. In Lucene 4.x you should use TextField instead, which is the standard field for textual content and is therefore analyzed (tokenized) at index time.

Source: Can’t index and find files [Java, Lucene, Tika, Log4j]

The relevant part of Indexador.java changes from:

// Before: every field is a StringField, so each value is indexed as one un-analyzed token.
documento.add(new StringField("UltimaModificacao", ultimaModificacao, Field.Store.YES));
documento.add(new StringField("Caminho", arquivo.getAbsolutePath(), Field.Store.YES));
documento.add(new StringField("Texto", textoExtraido, Field.Store.YES));

To:

// After: TextField values are analyzed at index time, so individual words become searchable.
// NOTE(review): the search code queries only the Texto field by default; confirm whether
// the date and path fields should be tokenized as well.
documento.add(new TextField("UltimaModificacao", ultimaModificacao, Field.Store.YES));
documento.add(new TextField("Caminho", arquivo.getAbsolutePath(), Field.Store.YES));
documento.add(new TextField("Texto", textoExtraido, Field.Store.YES));

Browse other questions tagged java lucene

You are not signed in. Login or sign up in order to post.

by Marco Reis • 96 points · Answer 1 · 2014-03-22T19:22:42+00:00

The Lucene 4 API is different from version 3. Below is a complete example of indexing and searching with Lucene 4.7.

package net.marcoreis.util;

import java.io.File;
import java.io.FilenameFilter;
import java.io.IOException;
import java.text.SimpleDateFormat;

import org.apache.log4j.Logger;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.SimpleFSDirectory;
import org.apache.lucene.util.Version;
import org.apache.tika.Tika;

/**
 * Builds a Lucene 4.7 index from the documents found under
 * {@code diretorioParaIndexar}.
 *
 * <p>Each supported file is converted to plain text with Tika and stored as one
 * Lucene document with three analyzed, stored fields: {@code UltimaModificacao}
 * (yyyyMMdd), {@code Caminho} (absolute path) and {@code Texto} (content).
 * Subdirectories are indexed recursively.
 */
public class Indexador {
    private static Logger logger = Logger.getLogger(Indexador.class);
    /** Lower-case file-name suffixes of the formats that are handed to Tika. */
    private static final String[] EXTENSOES = { ".pdf", ".odt", ".doc",
            ".docx", ".ppt", ".pptx", ".xls", ".txt", ".rtf" };
    // Directory that stores the index.
    private String diretorioDosIndices = System.getProperty("user.home")
            + "/indice-lucene";
    // Directory that contains the documents to be indexed.
    private String diretorioParaIndexar = System.getProperty("user.home")
            + "/Dropbox/entrada";
    // Writer used by indexaArquivo; only valid while an indexing run is active.
    private IndexWriter writer;
    // Lazily created text extractor.
    private Tika tika;

    /** Entry point: rebuilds the index from scratch. */
    public static void main(String[] args) {
        Indexador indexador = new Indexador();
        indexador.indexaArquivosDoDiretorio();
    }

    /**
     * Deletes the previous index files, indexes everything under
     * {@code diretorioParaIndexar} and commits the result.
     *
     * <p>The writer and the directory are closed in a {@code finally} block so
     * that the index write lock is released even when indexing fails. I/O
     * errors are logged, not rethrown.
     */
    public void indexaArquivosDoDiretorio() {
        Directory d = null;
        IndexWriter aberto = null;
        try {
            File diretorio = new File(diretorioDosIndices);
            apagaIndices(diretorio);
            d = new SimpleFSDirectory(diretorio);
            logger.info("Diretorio do indice: " + diretorioDosIndices);
            Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_47);
            IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_47,
                    analyzer);
            aberto = new IndexWriter(d, config);
            writer = aberto;
            long inicio = System.currentTimeMillis();
            indexaArquivosDoDiretorio(new File(diretorioParaIndexar));
            // Documents only become visible to searchers after the commit.
            aberto.commit();
            long fim = System.currentTimeMillis();
            logger.info("Tempo para indexar: " + ((fim - inicio) / 1000) + "s");
        } catch (IOException e) {
            logger.error("Erro ao criar o indice em " + diretorioDosIndices, e);
        } finally {
            fecha(aberto, "IndexWriter");
            fecha(d, "Directory");
        }
    }

    /** Closes {@code recurso} if it is non-null, logging (not throwing) failures. */
    private void fecha(java.io.Closeable recurso, String nome) {
        if (recurso == null) {
            return;
        }
        try {
            recurso.close();
        } catch (IOException e) {
            logger.error("Erro ao fechar " + nome, e);
        }
    }

    /**
     * Deletes the entries directly inside the index directory so that the index
     * is rebuilt from scratch. Does nothing when the directory does not exist
     * or cannot be listed.
     */
    private void apagaIndices(File diretorio) {
        // listFiles() returns null when the path is missing, is not a
        // directory, or cannot be read.
        File[] arquivos = diretorio.listFiles();
        if (arquivos == null) {
            return;
        }
        for (File arquivo : arquivos) {
            if (!arquivo.delete()) {
                logger.warn("Nao foi possivel apagar " + arquivo.getAbsolutePath());
            }
        }
    }

    /**
     * Indexes every supported file in {@code raiz} and recurses into its
     * subdirectories. A failure on one file is logged and does not stop the
     * run.
     *
     * @param raiz directory to scan
     */
    public void indexaArquivosDoDiretorio(File raiz) {
        FilenameFilter filtro = new FilenameFilter() {
            public boolean accept(File dir, String nome) {
                // Directories must pass the filter too; otherwise the recursive
                // branch below would never see them.
                return new File(dir, nome).isDirectory()
                        || temExtensaoSuportada(nome);
            }
        };
        File[] entradas = raiz.listFiles(filtro);
        if (entradas == null) {
            logger.warn("Nao foi possivel listar o diretorio "
                    + raiz.getAbsolutePath());
            return;
        }
        for (File arquivo : entradas) {
            if (arquivo.isFile()) {
                logger.info("Indexando o arquivo " + arquivo.getAbsoluteFile()
                        + ", " + (arquivo.length() / 1000) + "kb");
                try {
                    String textoExtraido = getTika().parseToString(arquivo);
                    indexaArquivo(arquivo, textoExtraido);
                } catch (Exception e) {
                    // Tika can fail in many ways on malformed documents; keep
                    // going with the remaining files.
                    logger.error("Erro ao indexar o arquivo "
                            + arquivo.getAbsolutePath(), e);
                }
            } else if (arquivo.isDirectory()) {
                indexaArquivosDoDiretorio(arquivo);
            }
        }
    }

    /** Returns true when {@code nome} ends, ignoring case, in a supported extension. */
    private static boolean temExtensaoSuportada(String nome) {
        String minusculo = nome.toLowerCase();
        for (String extensao : EXTENSOES) {
            if (minusculo.endsWith(extensao)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Adds one document describing {@code arquivo} to the index.
     *
     * @param arquivo       file being indexed
     * @param textoExtraido plain text extracted from the file
     */
    private void indexaArquivo(File arquivo, String textoExtraido) {
        SimpleDateFormat formatador = new SimpleDateFormat("yyyyMMdd");
        String ultimaModificacao = formatador.format(arquivo.lastModified());
        // TextField is analyzed at index time, which is what makes the content
        // searchable word by word; Store.YES keeps a copy for display.
        Document documento = new Document();
        documento.add(new TextField("UltimaModificacao", ultimaModificacao,
                Store.YES));
        documento.add(new TextField("Caminho", arquivo.getAbsolutePath(),
                Store.YES));
        documento.add(new TextField("Texto", textoExtraido, Store.YES));
        try {
            // The document becomes searchable only after the commit.
            getWriter().addDocument(documento);
        } catch (IOException e) {
            logger.error("Erro ao adicionar ao indice o arquivo "
                    + arquivo.getAbsolutePath(), e);
        }
    }

    /** Returns the shared Tika instance, creating it on first use. */
    public Tika getTika() {
        if (tika == null) {
            tika = new Tika();
        }
        return tika;
    }

    /** Returns the writer of the current (or most recent) indexing run. */
    public IndexWriter getWriter() {
        return writer;
    }
}

~~~~~~~~~~~~~~~~~~~~~

package net.marcoreis.util;

import java.io.File;

import javax.swing.JOptionPane;

import org.apache.log4j.Logger;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.SimpleFSDirectory;
import org.apache.lucene.util.Version;

/**
 * Searches the Lucene 4.7 index and logs the path, last-modified date and
 * score of each matching document.
 */
public class Buscador {
    private static Logger logger = Logger.getLogger(Buscador.class);
    // Directory that stores the index to be searched.
    private String diretorioDoIndice = System.getProperty("user.home")
            + "/indice-lucene";

    /**
     * Parses {@code parametro} with the classic QueryParser (default field
     * {@code Texto}), retrieves up to 100 hits and logs them.
     *
     * <p>A null or blank query (for example when the input dialog is
     * cancelled) is logged and skipped. The reader and the directory are
     * always closed, and failures are logged, not rethrown.
     *
     * @param parametro query string in Lucene query syntax
     */
    public void buscaComParser(String parametro) {
        if (parametro == null || parametro.trim().length() == 0) {
            logger.warn("Consulta vazia; nenhuma busca foi realizada");
            return;
        }
        Directory diretorio = null;
        IndexReader leitor = null;
        try {
            diretorio = new SimpleFSDirectory(new File(diretorioDoIndice));
            leitor = DirectoryReader.open(diretorio);
            IndexSearcher buscador = new IndexSearcher(leitor);
            // Must be the same analyzer that was used at index time so that
            // query terms and indexed terms are tokenized alike.
            Analyzer analisador = new StandardAnalyzer(Version.LUCENE_47);
            QueryParser parser = new QueryParser(Version.LUCENE_47, "Texto",
                    analisador);
            Query consulta = parser.parse(parametro);
            long inicio = System.currentTimeMillis();
            TopDocs resultado = buscador.search(consulta, 100);
            long fim = System.currentTimeMillis();
            int totalDeOcorrencias = resultado.totalHits;
            logger.info("Total de documentos encontrados:" + totalDeOcorrencias);
            logger.info("Tempo total para busca: " + (fim - inicio) + "ms");
            // Each ScoreDoc is one hit; load its stored fields to report it.
            for (ScoreDoc sd : resultado.scoreDocs) {
                Document documento = buscador.doc(sd.doc);
                logger.info("Caminho:" + documento.get("Caminho"));
                logger.info("Ultima modificacao:"
                        + documento.get("UltimaModificacao"));
                logger.info("Score:" + sd.score);
                logger.info("--------");
            }
        } catch (Exception e) {
            logger.error("Erro ao buscar por: " + parametro, e);
        } finally {
            fecha(leitor);
            fecha(diretorio);
        }
    }

    /** Closes {@code recurso} if it is non-null, logging (not throwing) failures. */
    private void fecha(java.io.Closeable recurso) {
        if (recurso == null) {
            return;
        }
        try {
            recurso.close();
        } catch (java.io.IOException e) {
            logger.error("Erro ao fechar o indice", e);
        }
    }

    /** Entry point: asks for a query in a dialog and runs the search. */
    public static void main(String[] args) {
        Buscador b = new Buscador();
        String parametro = JOptionPane.showInputDialog("Consulta");
        b.buscaComParser(parametro);
    }
}