Read the content of a PDF using client-side JavaScript

Asked

Viewed 601 times

-2

I have a PDF that I load via a file input, and I need to read the contents of the file with JavaScript in the browser, without using server-side code such as Node.js. I was able to get the contents as Base64, but that is not readable text.

Code I’m using:

<input id="inputFile" type="file" onchange="convertToBase64();" />

<script type="text/javascript">
    /**
     * Reads the file chosen in the #inputFile element and logs it to the
     * console as a Base64 data URL.
     *
     * The read is asynchronous: the result only exists once the load handler
     * fires, so it is logged there rather than right after readAsDataURL().
     * Does nothing when no file is selected.
     */
    function convertToBase64() {
        var selectedFile = document.getElementById("inputFile").files;
        if (selectedFile.length > 0) {
            var fileToLoad = selectedFile[0];
            var fileReader = new FileReader();
            fileReader.onload = function(fileLoadedEvent) {
                // The data URL produced by readAsDataURL().
                console.log(fileLoadedEvent.target.result);
            };
            fileReader.onerror = function() {
                // Report a failed read instead of failing silently.
                console.error(fileReader.error);
            };
            fileReader.readAsDataURL(fileToLoad);
        }
    }
</script>

1 answer

1


You can do this on the client side using this library:

https://mozilla.github.io/pdf.js/getting_started/

I made an example based on this tutorial:

https://ourcodeworld.com/articles/read/405/how-to-convert-pdf-to-text-extract-text-from-pdf-with-javascript

I placed the two files from the library (pdf.js and pdf.worker.js) in the same folder as the HTML page, and the script reads the PDF selected in the file input. The result appears in the browser console as an array in which each item is the text of one page.

<!DOCTYPE html>
<html lang="en">
	<head>
		<title></title>
		<meta charset="UTF-8">
		<meta name="viewport" content="width=device-width, initial-scale=1">
		<script src="pdf.js"></script>
	</head>

	<body>
		<h1>PDF.js</h1>
		
		<input id="inputFile" type="file" onchange="convertToBase64();" />
		<script>
			// pdf.js builds that expose `pdfjsLib` read the worker script location
			// from GlobalWorkerOptions.workerSrc. The plain `pdfjsLib.workerSrc`
			// assignment is kept only as a fallback for builds without it.
			if (pdfjsLib.GlobalWorkerOptions) {
				pdfjsLib.GlobalWorkerOptions.workerSrc = 'pdf.worker.js';
			} else {
				pdfjsLib.workerSrc = 'pdf.worker.js';
			}
			 /**
			  * Change handler for the #inputFile element: reads the first selected
			  * file as a data URL and passes the result to lerPDF().
			  */
			 function convertToBase64() {
				var chosenFiles = document.getElementById("inputFile").files;

				// Nothing to read until a file has been picked.
				if (!(chosenFiles.length > 0)) {
					return;
				}

				var reader = new FileReader();
				reader.onload = function (loadEvent) {
					// Hand the loaded data URL over for text extraction.
					lerPDF(loadEvent.target.result);
				};
				reader.readAsDataURL(chosenFiles[0]);
			}
			
			/**
			 * Loads a PDF with pdf.js, extracts the text of every page and logs the
			 * resulting array (one string per page) to the console.
			 *
			 * @param {string} fileToLoad Source handed to pdfjsLib.getDocument(); the
			 *   caller in this page passes the data URL produced by FileReader.
			 * @returns {Promise<string[]|undefined>} Resolves with the per-page texts,
			 *   or with undefined after a loading/extraction error has been logged.
			 */
			function lerPDF(fileToLoad) {
				var loadingTask = pdfjsLib.getDocument(fileToLoad);
				// pdf.js exposes the document promise as `loadingTask.promise`; fall
				// back to the task itself for builds where getDocument() returns a
				// thenable directly.
				var documentPromise = loadingTask.promise || loadingTask;

				return documentPromise.then(function (pdf) {
					var pagesPromises = [];

					// pdf.js pages are numbered from 1 up to numPages.
					for (var pageNumber = 1; pageNumber <= pdf.numPages; pageNumber++) {
						pagesPromises.push(getPageText(pageNumber, pdf));
					}

					return Promise.all(pagesPromises);
				}).then(function (pagesText) {
					// Display text of all the pages in the console
					console.log(pagesText);
					return pagesText;
				}).catch(function (reason) {
					// PDF loading or text extraction error
					console.error(reason);
				});
			}

			/**
			 * Retrieves the text of a specific page within a PDF document obtained through pdf.js.
			 *
			 * @param {number} pageNum Number of the page to read (the caller in this file passes 1..numPages)
			 * @param {PDFDocument} PDFDocumentInstance The PDF document obtained from pdf.js
			 * @returns {Promise<string>} Resolves with the page's text items concatenated,
			 *   each followed by a single space; rejects if the page or its text
			 *   content cannot be retrieved.
			 **/
			function getPageText(pageNum, PDFDocumentInstance) {
				// Chain the pdf.js promises instead of wrapping them in `new Promise`,
				// so a failure rejects the result rather than leaving it pending
				// forever. Starting from Promise.resolve() keeps a synchronous throw
				// from getPage() as a rejection, as the original wrapper did.
				return Promise.resolve().then(function () {
					return PDFDocumentInstance.getPage(pageNum);
				}).then(function (pdfPage) {
					// The main trick to obtain the text of the PDF page, use the getTextContent method
					return pdfPage.getTextContent();
				}).then(function (textContent) {
					// Join the string of every text item, keeping one space after each
					return textContent.items.map(function (item) {
						return item.str + " ";
					}).join("");
				});
			}
		</script>
	</body>
</html>

  • worked out thanks!!

Browse other questions tagged

You are not signed in. Login or sign up in order to post.