Categorygithub.com/imajacket/pdf
modulepackage
0.0.0-20240225173854-ea193637b663
Repository: https://github.com/imajacket/pdf.git
Documentation: pkg.go.dev

# README

PDF Reader

A simple Go library which enables reading PDF files.

This repository contains merged code from https://github.com/rsc/pdf and https://github.com/ledongthuc/pdf.

Features

  • Get plain text content (without formatting)
  • Get Content (including all font and formatting information)

Install:

go get -u github.com/dslipak/pdf

Read plain text

package main

import (
	"bytes"
	"fmt"

	"github.com/dslipak/pdf"
)

// main prints the plain-text content of test.pdf to standard output.
// It sets pdf.DebugOn before reading and panics if readPdf returns an error.
func main() {
	pdf.DebugOn = true
	content, err := readPdf("test.pdf") // read a local PDF file
	if err != nil {
		panic(err)
	}
	fmt.Println(content)
}

// readPdf opens the PDF file at path and returns its plain-text content.
//
// It returns an empty string together with the error if the file cannot be
// opened, if plain-text extraction fails, or if the extracted text cannot be
// read into memory.
func readPdf(path string) (string, error) {
	f, r, err := pdf.Open(path)
	if err != nil {
		return "", err
	}
	// Register the close only after Open has succeeded, so the deferred call
	// never runs against a file handle from a failed open.
	defer f.Close()

	b, err := r.GetPlainText()
	if err != nil {
		return "", err
	}

	var buf bytes.Buffer
	// ReadFrom reports read failures from b; do not return truncated text as success.
	if _, err := buf.ReadFrom(b); err != nil {
		return "", err
	}
	return buf.String(), nil
}

Read all text with styles from PDF

// readPdf2 opens the PDF file at path and prints every run of text, page by
// page, together with its font, font size and position.
//
// Consecutive texts for which isSameSentence reports true are merged into a
// single run before printing. Output goes to standard output; the returned
// string is always empty. An error is returned only if the file cannot be
// opened.
func readPdf2(path string) (string, error) {
	f, r, err := pdf.Open(path)
	if err != nil {
		return "", err
	}
	// Register the close only after Open has succeeded.
	defer f.Close()

	// emit prints one accumulated run of text with its style information.
	emit := func(t pdf.Text) {
		fmt.Printf("Font: %s, Font-size: %f, x: %f, y: %f, content: %s \n", t.Font, t.FontSize, t.X, t.Y, t.S)
	}

	totalPage := r.NumPage()
	for pageIndex := 1; pageIndex <= totalPage; pageIndex++ {
		p := r.Page(pageIndex)
		if p.V.IsNull() {
			continue
		}

		var lastTextStyle pdf.Text
		started := false // becomes true once lastTextStyle holds real text
		for _, text := range p.Content().Text {
			switch {
			case !started:
				// First text on the page: start a run instead of printing
				// the zero-value placeholder.
				lastTextStyle, started = text, true
			case isSameSentence(text, lastTextStyle):
				lastTextStyle.S += text.S
			default:
				emit(lastTextStyle)
				lastTextStyle = text
			}
		}
		// Flush the final run; without this the last text on every page
		// would be accumulated but never printed.
		if started {
			emit(lastTextStyle)
		}
	}
	return "", nil
}

Read text grouped by rows

package main

import (
	"fmt"
	"os"

	"github.com/dslipak/pdf"
)

// main reads the PDF file named by the first command-line argument and
// prints the string returned by readPdf to standard output.
//
// It exits with status 1 and a usage message when no argument is given, and
// panics if readPdf returns an error.
func main() {
	// Guard the os.Args[1] access: indexing without an argument would panic
	// with an index-out-of-range error instead of a helpful message.
	if len(os.Args) < 2 {
		fmt.Fprintln(os.Stderr, "usage: <program> <file.pdf>")
		os.Exit(1)
	}

	content, err := readPdf(os.Args[1]) // read a local PDF file
	if err != nil {
		panic(err)
	}
	fmt.Println(content)
}

// readPdf opens the PDF file at path and prints its text grouped by rows,
// page by page.
//
// For every row it prints a header line with the row position, followed by
// one line per word in that row. The returned string is always empty. An
// error is returned if the file cannot be opened or if the rows of a page
// cannot be extracted.
func readPdf(path string) (string, error) {
	f, r, err := pdf.Open(path)
	if err != nil {
		return "", err
	}
	// Register the close only after Open has succeeded.
	defer func() {
		_ = f.Close() // file was opened for reading; a close error is not actionable here
	}()

	totalPage := r.NumPage()
	for pageIndex := 1; pageIndex <= totalPage; pageIndex++ {
		p := r.Page(pageIndex)
		if p.V.IsNull() {
			continue
		}

		rows, err := p.GetTextByRow()
		if err != nil {
			// Surface extraction failures instead of silently printing nothing.
			return "", fmt.Errorf("page %d: %w", pageIndex, err)
		}
		for _, row := range rows {
			// NOTE(review): this is the builtin println, kept from the original;
			// its output destination is implementation-specific and may differ
			// from the fmt.Println calls below.
			println(">>>> row: ", row.Position)
			for _, word := range row.Content {
				fmt.Println(word.S)
			}
		}
	}
	return "", nil
}

# Functions

Interpret interprets the content in a stream as a basic PostScript program, pushing values onto a stack and then calling the do function to execute operators.
NewReader opens a file for reading, using the data in f with the given total size.
NewReaderEncrypted opens a file for reading, using the data in f with the given total size.
Open opens a file for reading.

# Constants

The PDF value kinds.
The PDF value kinds.
The PDF value kinds.
The PDF value kinds.
The PDF value kinds.
The PDF value kinds.
The PDF value kinds.
The PDF value kinds.
The PDF value kinds.

# Variables

No description provided by the author

# Structs

Column represents the contents of a column.
Content describes the basic content on a page: the text and any drawn rectangles.
A Font represents a font in a PDF file.
An Outline is a tree describing the outline (also known as the table of contents) of a document.
A Page represents a single page in a PDF file.
A Point represents an X, Y pair.
A Reader is a single PDF file open for reading.
A Rect represents a rectangle.
Row represents the contents of a row.
A Stack represents a stack of values.
A Text represents a single piece of text drawn on a page.
A Value is a single PDF value, such as an integer, dictionary, or array.

# Interfaces

A TextEncoding represents a mapping between font code points and UTF-8 text.

# Type aliases

Columns is a list of columns.
Rows is a list of rows.
TextHorizontal implements sort.Interface for sorting a slice of Text values in horizontal order, left to right, and then top to bottom within a column.
TextVertical implements sort.Interface for sorting a slice of Text values in vertical order, top to bottom, and then left to right within a line.
A ValueKind specifies the kind of data underlying a Value.