modulepackage
0.0.0-20240225173854-ea193637b663
Repository: https://github.com/imajacket/pdf.git
Documentation: pkg.go.dev
# README
PDF Reader
A simple Go library which enables reading PDF files.
This repository contains merged code from https://github.com/rsc/pdf and https://github.com/ledongthuc/pdf
Features
- Get plain text content (without formatting)
- Get Content (including all font and formatting information)
Install:
go get -u github.com/dslipak/pdf
Read plain text
package main
import (
"bytes"
"fmt"
"github.com/dslipak/pdf"
)
// main sets pdf.DebugOn, extracts the text of test.pdf through readPdf, and
// prints it. Any error reported by readPdf aborts the program via panic.
func main() {
	pdf.DebugOn = true

	text, readErr := readPdf("test.pdf") // local PDF file
	if readErr != nil {
		panic(readErr)
	}

	fmt.Println(text)
}
// readPdf opens the PDF at path and returns its plain-text content.
//
// It returns an empty string and a non-nil error if the file cannot be
// opened, if the plain-text reader cannot be obtained, or if draining that
// reader fails.
func readPdf(path string) (string, error) {
	f, r, err := pdf.Open(path)
	// The close is deferred before the error check, as in the original
	// example. NOTE(review): confirm what Open returns for f on failure
	// before moving this below the check.
	defer f.Close()
	if err != nil {
		return "", err
	}

	b, err := r.GetPlainText()
	if err != nil {
		return "", err
	}

	var buf bytes.Buffer
	// Surface read failures instead of returning truncated text with a nil error.
	if _, err := buf.ReadFrom(b); err != nil {
		return "", err
	}
	return buf.String(), nil
}
Read all text with styles from PDF
// readPdf2 opens the PDF at path and prints every run of text together with
// its font, font size and position, one line per run.
//
// Consecutive Text items for which isSameSentence reports true are merged
// into one run by concatenating their S fields. The function always returns
// an empty string; the error is non-nil only when the file cannot be opened.
func readPdf2(path string) (string, error) {
	f, r, err := pdf.Open(path)
	// remember close file
	defer f.Close()
	if err != nil {
		return "", err
	}

	totalPage := r.NumPage()
	for pageIndex := 1; pageIndex <= totalPage; pageIndex++ {
		p := r.Page(pageIndex)
		if p.V.IsNull() {
			continue
		}

		var lastTextStyle pdf.Text
		// flush prints the run accumulated so far in lastTextStyle.
		flush := func() {
			fmt.Printf("Font: %s, Font-size: %f, x: %f, y: %f, content: %s \n", lastTextStyle.Font, lastTextStyle.FontSize, lastTextStyle.X, lastTextStyle.Y, lastTextStyle.S)
		}

		// seen records whether at least one Text has been consumed on this
		// page, so the zero-value lastTextStyle is never printed as a run.
		seen := false
		texts := p.Content().Text
		for _, text := range texts {
			if isSameSentence(text, lastTextStyle) {
				lastTextStyle.S = lastTextStyle.S + text.S
			} else {
				if seen {
					flush()
				}
				lastTextStyle = text
			}
			seen = true
		}

		// The final run of the page has no successor to trigger its output,
		// so print it explicitly.
		if seen {
			flush()
		}
	}
	return "", nil
}
Read text grouped by rows
package main
import (
"fmt"
"os"
"github.com/dslipak/pdf"
)
// main passes the first command-line argument to readPdf as the PDF path and
// prints the returned string, panicking if readPdf reports an error.
func main() {
	pdfPath := os.Args[1] // local PDF file

	out, readErr := readPdf(pdfPath)
	if readErr != nil {
		panic(readErr)
	}

	fmt.Println(out)
}
// readPdf opens the PDF at path and prints its text grouped by rows: for each
// row it prints the row position, then each word of the row on its own line.
//
// The returned string is always empty. The error is non-nil when the file
// cannot be opened or when the rows of a page cannot be extracted.
func readPdf(path string) (string, error) {
	f, r, err := pdf.Open(path)
	defer func() {
		// The Close error is deliberately ignored: the file is only read.
		_ = f.Close()
	}()
	if err != nil {
		return "", err
	}

	totalPage := r.NumPage()
	for pageIndex := 1; pageIndex <= totalPage; pageIndex++ {
		p := r.Page(pageIndex)
		if p.V.IsNull() {
			continue
		}

		rows, err := p.GetTextByRow()
		if err != nil {
			// Report the failing page instead of silently printing nothing.
			return "", fmt.Errorf("reading rows of page %d: %w", pageIndex, err)
		}
		for _, row := range rows {
			println(">>>> row: ", row.Position)
			for _, word := range row.Content {
				fmt.Println(word.S)
			}
		}
	}
	return "", nil
}
# Functions
Interpret interprets the content in a stream as a basic PostScript program, pushing values onto a stack and then calling the do function to execute operators.
NewReader opens a file for reading, using the data in f with the given total size.
NewReaderEncrypted opens a file for reading, using the data in f with the given total size.
Open opens a file for reading.
# Variables
No description provided by the author
# Structs
Column represents the contents of a column.
Content describes the basic content on a page: the text and any drawn rectangles.
A Font represents a font in a PDF file.
An Outline is a tree describing the outline (also known as the table of contents) of a document.
A Page represents a single page in a PDF file.
A Point represents an X, Y pair.
A Reader is a single PDF file open for reading.
A Rect represents a rectangle.
Row represents the contents of a row.
A Stack represents a stack of values.
A Text represents a single piece of text drawn on a page.
A Value is a single PDF value, such as an integer, dictionary, or array.
# Interfaces
A TextEncoding represents a mapping between font code points and UTF-8 text.
# Type aliases
Columns is a list of columns.
Rows is a list of rows.
TextHorizontal implements sort.Interface for sorting a slice of Text values in horizontal order, left to right, and then top to bottom within a column.
TextVertical implements sort.Interface for sorting a slice of Text values in vertical order, top to bottom, and then left to right within a line.
A ValueKind specifies the kind of data underlying a Value.