Module: github.com/branow/htmlscraper (v0.4.0)
Repository: https://github.com/branow/htmlscraper.git
Documentation: pkg.go.dev

# README

htmlscraper - automated HTML scraping with jQuery-like selectors in Go

Build Status Go Report Card PkgGoDev

Table of Contents

Installation

To install htmlscraper, use go get:

go get github.com/branow/htmlscraper

This will then make the following package available to you:

github.com/branow/htmlscraper/scrape

To update htmlscraper to the latest version, use go get -u github.com/branow/htmlscraper.

We currently support the most recent major Go versions from 1.23 onward.

Examples

Let's scrape the following body tag of the catalog.html file.

<body>
    <div class="container">
        <h1>Product Catalog</h1>
        <div class="catalog">
            <div class="product">
                <img src="https://via.placeholder.com/200" alt="Product 1">
                <h2>Product 1</h2>
                <p>Great product for your needs.</p>
                <p class="price">$29.99</p>
            </div>
            <div class="product">
                <img src="https://via.placeholder.com/200" alt="Product 2">
                <h2>Product 2</h2>
                <p>Top-rated product with excellent reviews.</p>
                <p class="price">$39.99</p>
            </div>
            <div class="product">
                <img src="https://via.placeholder.com/200" alt="Product 3">
                <h2>Product 3</h2>
                <p>Best value for your money.</p>
                <p class="price">$19.99</p>
            </div>
            <div class="product">
                <h2>Product 4</h2>
                <p>The product that you want to buy.</p>
                <p class="price">$10.99</p>
            </div>
        </div>
    </div>
</body>

Scrape the catalog name

package examples

import (
	"fmt"

	"github.com/PuerkitoBio/goquery"
	"github.com/branow/htmlscraper/scrape"
)

func ScrapeString() {
	// create goquery document
	file := getCatalogFile() //get catalog.html
	defer file.Close()
	doc, err := goquery.NewDocumentFromReader(file)
	raisePanic(err)

	// create Scraper
	scraper := scrape.Scraper{}

	// scraping
	var catalog string //product catalog name
	err = scraper.Scrape(doc, &catalog, ".container > h1", "text")

	// get output
	fmt.Println("Got Error:", err)
	fmt.Println("Got Output:", catalog)
}

It prints:

Got Error: <nil>
Got Output: Product Catalog

The example file.

Scrape the product names

package examples

import (
	"fmt"

	"github.com/PuerkitoBio/goquery"
	"github.com/branow/htmlscraper/scrape"
)

func ScrapeSliceOfStrings() {
	// create goquery document
	file := getCatalogFile()
	defer file.Close()
	doc, err := goquery.NewDocumentFromReader(file)
	raisePanic(err)

	// create Scraper
	scraper := scrape.Scraper{}

	// scraping
	var products []string //product names
	err = scraper.Scrape(doc, &products, ".product > h2", "text")

	// get output
	fmt.Println("Got Error:", err)
	fmt.Println("Got Output:", products)
}

It prints:

Got Error: <nil>
Got Output: [Product 1 Product 2 Product 3 Product 4]

The example file.

Scrape the products

package examples

import (
	"fmt"
	"strings"

	"github.com/PuerkitoBio/goquery"
	"github.com/branow/htmlscraper/scrape"
	"golang.org/x/net/html"
)

func ScrapeSliceOfStructs() {
	// create goquery document
	file := getCatalogFile()
	defer file.Close()
	doc, err := goquery.NewDocumentFromReader(file)
	raisePanic(err)

	// create custom extractor for price data
	priceMatch := scrape.GetEqualMatch("*price")
	priceExtractor := func(node *html.Node, extract string) (string, error) {
		price := node.FirstChild.Data
		return strings.Replace(price, "$", "", 1), nil
	}
	customExtractors := map[*scrape.Match]scrape.Extractor{&priceMatch: priceExtractor}

	// create Scraper
	scraper := scrape.Scraper{Mode: scrape.Tolerant, Extractors: customExtractors}

	// scraping
	type Product struct {
		Name        string `select:"h2" extract:"text"`
		Description string `select:"p" extract:"text"`
		Price       string `select:".price" extract:"*price"`
		Image       string `select:"img" extract:"@src"`
	}
	var products []Product
	err = scraper.Scrape(doc, &products, ".product", "")

	// get output
	fmt.Println("Got Error:", err)
	fmt.Println("Got Output:")
	for _, p := range products {
		fmt.Println(p)
	}
}

It prints:

Got Error: scrape: .product:n(3) img no nodes found
Got Output:
{Product 1 Great product for your needs. 29.99 https://via.placeholder.com/200}
{Product 2 Top-rated product with excellent reviews. 39.99 https://via.placeholder.com/200}
{Product 3 Best value for your money. 19.99 https://via.placeholder.com/200}
{Product 4 The product that you want to buy. 10.99 }

The example file.

Scrape the catalog

package examples

import (
	"fmt"
	"strings"

	"github.com/PuerkitoBio/goquery"
	"github.com/branow/htmlscraper/scrape"
	"golang.org/x/net/html"
)

func ScrapeStruct() {
	// create goquery document
	file := getCatalogFile()
	defer file.Close()
	doc, err := goquery.NewDocumentFromReader(file)
	raisePanic(err)

	// create custom extractor for price data
	priceMatch := scrape.GetEqualMatch("*price")
	priceExtractor := func(node *html.Node, extract string) (string, error) {
		price := node.FirstChild.Data
		return strings.Replace(price, "$", "", 1), nil
	}
	customExtractors := map[*scrape.Match]scrape.Extractor{&priceMatch: priceExtractor}

	// create Scraper
	scraper := scrape.Scraper{Mode: scrape.Tolerant, Extractors: customExtractors}

	// scraping
	type Product struct {
		Name        string `select:"h2" extract:"text"`
		Description string `select:"p" extract:"text"`
		Price       string `select:".price" extract:"*price"`
		Image       string `select:"img" extract:"@src"`
	}
	type Catalog struct {
		Name     string    `select:"h1" extract:"text"`
		Products []Product `select:".product"`
	}
	var catalog Catalog
	err = scraper.Scrape(doc, &catalog, ".container", "")

	// get output
	fmt.Println("Got Error:", err)
	fmt.Println("Got Output:")
	fmt.Println("Catalog {")
	fmt.Println(catalog.Name)
	for _, p := range catalog.Products {
		fmt.Println(p)
	}
	fmt.Println("}")
}

It prints:

Got Error: scrape: .container .product:n(3) img no nodes found
Got Output:
Catalog {
Product Catalog
{Product 1 Great product for your needs. 29.99 https://via.placeholder.com/200}
{Product 2 Top-rated product with excellent reviews. 39.99 https://via.placeholder.com/200}
{Product 3 Best value for your money. 19.99 https://via.placeholder.com/200}
{Product 4 The product that you want to buy. 10.99 }
}

The example file.

Scrape absent data

If some data may be absent from an HTML document, use pointer fields. Pointers can save a little memory, but, more importantly, checking whether a pointer is nil tells you whether the data was absent from the HTML document.

package examples

import (
	"fmt"
	"strings"

	"github.com/PuerkitoBio/goquery"
	"github.com/branow/htmlscraper/scrape"
	"golang.org/x/net/html"
)

func ScrapePointers() {
	// create goquery document
	file := getCatalogFile()
	defer file.Close()
	doc, err := goquery.NewDocumentFromReader(file)
	raisePanic(err)

	// create custom extractor for price data
	priceMatch := scrape.GetEqualMatch("*price")
	priceExtractor := func(node *html.Node, extract string) (string, error) {
		price := node.FirstChild.Data
		return strings.Replace(price, "$", "", 1), nil
	}
	customExtractors := map[*scrape.Match]scrape.Extractor{&priceMatch: priceExtractor}

	// create Scraper
	scraper := scrape.Scraper{Mode: scrape.Tolerant, Extractors: customExtractors}

	// scraping
	type Image struct {
		Src string `extract:"@src"`
		Alt string `extract:"@alt"`
	}
	type Product struct {
		Name        string `select:"h2" extract:"text"`
		Description string `select:"p" extract:"text"`
		Price       string `select:".price" extract:"*price"`
		Image       *Image `select:"img"`
	}
	var products []Product
	err = scraper.Scrape(doc, &products, ".product", "")

	// get output
	fmt.Println("Got Error:", err)
	fmt.Println("Got Output:")
	for _, p := range products {
		fmt.Println(p)
	}
}

It prints:

Got Error: scrape: .product:n(3) img no nodes found
Got Output:
{Product 1 Great product for your needs. 29.99 0xc00009ade0}
{Product 2 Top-rated product with excellent reviews. 39.99 0xc00009ae80}
{Product 3 Best value for your money. 19.99 0xc00009af20}
{Product 4 The product that you want to buy. 10.99 <nil>}

The example file.

Contributing

Please feel free to submit issues, fork the repository and send pull requests!

License

This project is licensed under the terms of the MIT license.

# Packages

No description provided by the author
Package scrape implements scraping functionality for extracting useful data from HTTP text using jQuery-like selectors.