modulepackage
0.1.1
Repository: https://github.com/importcjj/go-readability.git
Documentation: pkg.go.dev
# README
Go-Readability
Go-Readability is a Go package that cleans an HTML page of clutter like buttons, ads and background images, and changes the page's text size, contrast and layout for better readability.
This package is a fork of readability and go-readability, which were inspired by readability for node.js and readability for python.
Why fork ?
There are several reasons why I created a new fork instead of sending a PR to the original repository. I need to:
- Extract images
- Readable mix HTML tags
- Custom line break
Example
package main
import (
"fmt"
nurl "net/url"
"time"
"github.com/importcjj/go-readability"
)
// main demonstrates typical use of the package: it parses an article URL,
// runs it through an Extractor, and prints the extracted metadata, text,
// tidy HTML and images. Any failure aborts the example with a panic.
func main() {
	// Create URL
	url := "https://www.nytimes.com/2018/01/21/technology/inside-amazon-go-a-store-of-the-future.html"
	parsedURL, err := nurl.Parse(url)
	if err != nil {
		// Stop here instead of handing an unchecked parse result to the extractor.
		panic(err)
	}

	// Configure how the extracted text is rendered.
	extractor := &readability.Extractor{
		TextLineBreak:  "<br/><br/>",
		TextWithImgTag: true,
	}

	// Fetch readable content.
	// NOTE(review): the 5-second argument is presumably the fetch timeout — confirm against FromURL.
	article, err := extractor.FromURL(parsedURL, 5*time.Second)
	if err != nil {
		panic(err)
	}

	// Show results
	fmt.Println(article.Meta.Title)
	fmt.Println(article.Meta.Excerpt)
	fmt.Println(article.Meta.Author)

	// readable content
	fmt.Println(article.Text)

	// Tidy HTML
	fmt.Println(article.HTML)

	// Images
	fmt.Println(article.Images)
}
# Functions
No description provided by the author
FromReader gets readable content from the specified io.Reader.
FromURL gets readable content from the specified URL.
GetHTMLContent fetches and cleans the raw HTML from the article.
GetTextContent fetches and cleans the text from the article.
No description provided by the author
No description provided by the author
# Structs
Article is the content of a URL.
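Extractor extracts readable content from a page; it is configured through fields such as TextLineBreak and TextWithImgTag.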
Metadata is metadata of an article.
No description provided by the author
# Type aliases
No description provided by the author