modulepackage
0.0.0-20170913195834-7423397b5b61
Repository: https://github.com/ntns/pdf.git
Documentation: pkg.go.dev
# README
PDF Reader
A simple Go library which enables reading PDF files.
Features
- Get plain text content (without formatting)
- Get Content (including all font and formatting information)
Install:
go get -u github.com/rsc/pdf
Read plain text
package main
import (
"bytes"
"fmt"
"github.com/rsc/pdf"
)
// main extracts the plain text of test.pdf via readPdf and prints it to
// standard output. It panics if readPdf reports an error.
func main() {
	// Read a PDF file from the local file system.
	text, err := readPdf("test.pdf")
	if err != nil {
		panic(err)
	}
	fmt.Println(text)
}
// readPdf opens the PDF file at path and returns its plain-text content.
//
// It returns an empty string and a non-nil error when the file cannot be
// opened or when reading the extracted text fails.
func readPdf(path string) (string, error) {
	r, err := pdf.Open(path)
	if err != nil {
		return "", err
	}

	var buf bytes.Buffer
	// Drain the plain-text stream of the opened reader r into buf and
	// surface any read error instead of returning truncated text.
	if _, err := buf.ReadFrom(r.GetPlainText()); err != nil {
		return "", err
	}
	return buf.String(), nil
}
Read all text with styles from PDF
// readPdf2 opens the PDF file at path and prints every run of text it
// contains, page by page, together with the font, font size and position
// recorded for the run. Consecutive fragments that isSameSentence reports
// as belonging together are concatenated into a single run before printing.
//
// The text is written to standard output; the returned string is always
// empty. A non-nil error is returned only when the file cannot be opened.
func readPdf2(path string) (string, error) {
	r, err := pdf.Open(path)
	if err != nil {
		return "", err
	}

	// printRun writes one accumulated run of text with its style details.
	printRun := func(run pdf.Text) {
		fmt.Printf("Font: %s, Font-size: %f, x: %f, y: %f, content: %s \n", run.Font, run.FontSize, run.X, run.Y, run.S)
	}

	totalPage := r.NumPage()
	// The loop addresses pages 1 through totalPage inclusive.
	for pageIndex := 1; pageIndex <= totalPage; pageIndex++ {
		p := r.Page(pageIndex)
		if p.V.IsNull() {
			continue
		}

		texts := p.Content().Text
		if len(texts) == 0 {
			continue
		}

		// Seed the run with the first fragment so that its font and
		// position are kept and no empty record is printed up front.
		run := texts[0]
		for _, text := range texts[1:] {
			if isSameSentence(text, run) {
				run.S += text.S
				continue
			}
			printRun(run)
			run = text
		}
		// Flush the final run; nothing follows it to trigger printing.
		printRun(run)
	}
	return "", nil
}
Demo
# Packages
Pdfpasswd searches for the password for an encrypted PDF by trying all strings over a given alphabet up to a given length.
# Functions
Interpret interprets the content in a stream as a basic PostScript program, pushing values onto a stack and then calling the do function to execute operators.
NewReader opens a file for reading, using the data in f with the given total size.
NewReaderEncrypted opens a file for reading, using the data in f with the given total size.
Open opens a file for reading.
# Variables
No description provided by the author
# Structs
Content describes the basic content on a page: the text and any drawn rectangles.
A Font represents a font in a PDF file.
An Outline is a tree describing the outline (also known as the table of contents) of a document.
A Page represents a single page in a PDF file.
A Point represents an X, Y pair.
A Reader is a single PDF file open for reading.
A Rect represents a rectangle.
A Stack represents a stack of values.
A Text represents a single piece of text drawn on a page.
A Value is a single PDF value, such as an integer, dictionary, or array.
# Interfaces
A TextEncoding represents a mapping between font code points and UTF-8 text.
# Type aliases
TextHorizontal implements sort.Interface for sorting a slice of Text values in horizontal order, left to right, and then top to bottom within a column.
TextVertical implements sort.Interface for sorting a slice of Text values in vertical order, top to bottom, and then left to right within a line.
A ValueKind specifies the kind of data underlying a Value.