A RetroSearch Logo

Home - News ( United States | United Kingdom | Italy | Germany ) - Football scores

Search Query:

Showing content from https://www.npmjs.com/package/pdf-text-extract below:

pdf-text-extract - npm

PDF Text Extract

Extract text from PDFs that contain searchable text. The module is a wrapper that calls the pdftotext command to perform the actual extraction.

Installation

npm install --save pdf-text-extract

You will need the pdftotext binary available on your path. There are packages available for many different operating systems

See https://github.com/nisaacson/pdf-extract#osx for how to install the pdftotext command

Usage As a module

extract(filePath, [options], [pdftotextcommand], callback)

Options and pdftotextcommand are not required.

var path = require('path')

var filePath = path.join(__dirname, 'test/data/multipage.pdf')

var extract = require('pdf-text-extract')

extract(filePath, function (err, pages) {

  if (err) {

    console.dir(err)

    return

  }

  console.dir(pages)

})

The output will be an array in which each entry is one page of text. If you want a single string containing all pages instead, set the option splitPages: false.

var filePath = path.join(__dirname, 'test/data/multipage.pdf')

var extract = require('pdf-text-extract')

extract(filePath, { splitPages: false }, function (err, text) {

  if (err) {

    console.dir(err)

    return

  }

  console.dir(text)

})

You can set the following options:

If needed, you can pass optional arguments to the extract function. These will be passed to the child_process.spawn call.

var filePath = path.join(__dirname, 'test/data/multipage.pdf')

var extract = require('pdf-text-extract')

var options = {

  cwd: "./"

}

extract(filePath, options, function (err, pages) {

  if (err) {

    console.dir(err)

    return

  }

  console.dir('extracted pages', pages)

})

You can also override the command for pdftotext if it is installed in a location that is not available in the PATH environment variable

var filePath = path.join(__dirname, 'test/data/multipage.pdf')

var pdfToTextCommand = '/opt/bin/pdftotext'

var extract = require('pdf-text-extract')

var options = {

  cwd: "./"

}

extract(filePath, options, pdfToTextCommand, function (err, pages) {

  if (err) {

    console.dir(err)

    return

  }

  console.dir('extracted pages', pages)

})

As a command line tool

npm install -g pdf-text-extract

Execute with the filePath as an argument. The output will be a JSON-formatted array of pages.

pdf-text-extract ./test/data/multipage.pdf

Test

RetroSearch is an open source project built by @garambo | Open a GitHub Issue

Search and Browse the WWW like it's 1997 | Search results from DuckDuckGo

HTML: 3.2 | Encoding: UTF-8 | Version: 0.7.4