Category: github.com/thewizardplusplus/go-html-selector
Type: module, package
Version: 1.6.1
Repository: https://github.com/thewizardplusplus/go-html-selector.git
Documentation: pkg.go.dev

# README

go-html-selector

GoDoc Go Report Card Build Status codecov

A library for collecting text, specified HTML tags, and their attributes from an HTML document.

Features

  • collecting from an HTML document:
    • HTML tags;
    • HTML attributes;
    • text (optional);
  • options:
    • filters:
      • filtering a result:
        • by specified HTML tags:
          • with support of the universal tag (*);
        • by specified HTML attributes;
      • friendly representation of filters:
        • for parsing from JSON;
        • for definition as a code literal;
    • skipping empty entities (separately optional):
      • tags without attributes;
      • attributes with an empty value;
      • whitespace-only text;
  • representing a result:
    • using the builder interface for building a result;
    • built-in builders:
      • with grouping HTML attributes by their tags;
      • with collecting only values of HTML attributes;
      • with collecting only text:
        • with support of merging with other builders;
  • optimizations:
    • of searching for the right filter among others:
      • by removing attribute filters for a tag when the same filters are already specified for the universal tag;
    • of conversion from a byte slice to a string;
    • by the number:
      • of memory allocations;
      • of string copies.

Installation

Prepare the directory:

$ mkdir --parents "$(go env GOPATH)/src/github.com/thewizardplusplus/"
$ cd "$(go env GOPATH)/src/github.com/thewizardplusplus/"

Clone this repository:

$ git clone https://github.com/thewizardplusplus/go-html-selector.git
$ cd go-html-selector

Install dependencies with the dep tool:

$ dep ensure -vendor-only

Examples

htmlselector.SelectTags() with the structural builder:

package main

import (
	"fmt"
	"log"
	"strings"

	htmlselector "github.com/thewizardplusplus/go-html-selector"
	"github.com/thewizardplusplus/go-html-selector/builders"
)

// main demonstrates htmlselector.SelectTags() with the structural builder:
// it prints every collected tag followed by the attributes selected from it.
func main() {
	// Four list items: two fully populated, one whose attributes are all empty,
	// and one whose tags carry no attributes at all.
	const markup = `
		<ul>
			<li>
				<a href="http://example.com/1">1</a>
				<video
					src="http://example.com/1.1"
					poster="http://example.com/1.2">
				</video>
			</li>
			<li>
				<a href="http://example.com/2">2</a>
				<video
					src="http://example.com/2.1"
					poster="http://example.com/2.2">
				</video>
			</li>
			<li>
				<a href="">3</a>
				<video src="" poster=""></video>
			</li>
			<li>
				<a>4</a>
				<video></video>
			</li>
		</ul>
	`

	// Select "href" of <a> tags and "src"/"poster" of <video> tags.
	tagFilters := htmlselector.OptimizeFilters(htmlselector.FilterGroup{
		"a":     {"href"},
		"video": {"src", "poster"},
	})

	// Empty attributes and tags without attributes are skipped, which is why
	// the last two list items do not show up in the output below.
	structure := new(builders.StructuralBuilder)
	if err := htmlselector.SelectTags(
		strings.NewReader(markup),
		tagFilters,
		structure,
		htmlselector.SkipEmptyTags(),
		htmlselector.SkipEmptyAttributes(),
	); err != nil {
		log.Fatal(err)
	}

	for _, selectedTag := range structure.Tags() {
		fmt.Printf("<%s>:\n", selectedTag.Name)
		for _, attr := range selectedTag.Attributes {
			fmt.Printf("  %s=%q\n", attr.Name, attr.Value)
		}
	}

	// Output:
	// <a>:
	//   href="http://example.com/1"
	// <video>:
	//   src="http://example.com/1.1"
	//   poster="http://example.com/1.2"
	// <a>:
	//   href="http://example.com/2"
	// <video>:
	//   src="http://example.com/2.1"
	//   poster="http://example.com/2.2"
}

htmlselector.SelectTags() with the flatten builder:

package main

import (
	"fmt"
	"log"
	"strings"

	htmlselector "github.com/thewizardplusplus/go-html-selector"
	"github.com/thewizardplusplus/go-html-selector/builders"
)

// main demonstrates htmlselector.SelectTags() with the flatten builder:
// it prints only the values of the selected attributes, one per line.
func main() {
	// Four list items: two fully populated, one whose attributes are all empty,
	// and one whose tags carry no attributes at all.
	const markup = `
		<ul>
			<li>
				<a href="http://example.com/1">1</a>
				<video
					src="http://example.com/1.1"
					poster="http://example.com/1.2">
				</video>
			</li>
			<li>
				<a href="http://example.com/2">2</a>
				<video
					src="http://example.com/2.1"
					poster="http://example.com/2.2">
				</video>
			</li>
			<li>
				<a href="">3</a>
				<video src="" poster=""></video>
			</li>
			<li>
				<a>4</a>
				<video></video>
			</li>
		</ul>
	`

	// Select "href" of <a> tags and "src"/"poster" of <video> tags.
	tagFilters := htmlselector.OptimizeFilters(htmlselector.FilterGroup{
		"a":     {"href"},
		"video": {"src", "poster"},
	})

	// Attributes with an empty value are skipped, so no empty strings appear
	// in the output below.
	flat := new(builders.FlattenBuilder)
	if err := htmlselector.SelectTags(
		strings.NewReader(markup),
		tagFilters,
		flat,
		htmlselector.SkipEmptyAttributes(),
	); err != nil {
		log.Fatal(err)
	}

	for _, value := range flat.AttributeValues() {
		fmt.Printf("%q\n", value)
	}

	// Output:
	// "http://example.com/1"
	// "http://example.com/1.1"
	// "http://example.com/1.2"
	// "http://example.com/2"
	// "http://example.com/2.1"
	// "http://example.com/2.2"
}

htmlselector.SelectTags() with the universal tag:

package main

import (
	"fmt"
	"log"
	"strings"

	htmlselector "github.com/thewizardplusplus/go-html-selector"
	"github.com/thewizardplusplus/go-html-selector/builders"
)

// main demonstrates htmlselector.SelectTags() with the universal tag:
// attributes listed for htmlselector.UniversalTag are selected in addition to
// the ones listed for a specific tag, as the output below shows.
func main() {
	// Two list items, each with a titled link and a titled video.
	const markup = `
		<ul>
			<li>
				<a href="http://example.com/1" title="link #1">1</a>
				<video
					src="http://example.com/1.1"
					poster="http://example.com/1.2"
					title="video #1">
				</video>
			</li>
			<li>
				<a href="http://example.com/2" title="link #2">2</a>
				<video
					src="http://example.com/2.1"
					poster="http://example.com/2.2"
					title="video #2">
				</video>
			</li>
		</ul>
	`

	// "title" and "href" are requested via the universal tag;
	// "src" and "poster" are requested for <video> tags only.
	tagFilters := htmlselector.OptimizeFilters(htmlselector.FilterGroup{
		htmlselector.UniversalTag: {"title", "href"},
		"video":                   {"src", "poster"},
	})

	// Tags without attributes are skipped.
	structure := new(builders.StructuralBuilder)
	if err := htmlselector.SelectTags(
		strings.NewReader(markup),
		tagFilters,
		structure,
		htmlselector.SkipEmptyTags(),
	); err != nil {
		log.Fatal(err)
	}

	for _, selectedTag := range structure.Tags() {
		fmt.Printf("<%s>:\n", selectedTag.Name)
		for _, attr := range selectedTag.Attributes {
			fmt.Printf("  %s=%q\n", attr.Name, attr.Value)
		}
	}

	// Output:
	// <a>:
	//   href="http://example.com/1"
	//   title="link #1"
	// <video>:
	//   src="http://example.com/1.1"
	//   poster="http://example.com/1.2"
	//   title="video #1"
	// <a>:
	//   href="http://example.com/2"
	//   title="link #2"
	// <video>:
	//   src="http://example.com/2.1"
	//   poster="http://example.com/2.2"
	//   title="video #2"
}

htmlselector.SelectTags() with the text builder:

package main

import (
	"bytes"
	"fmt"
	"log"
	"strings"

	htmlselector "github.com/thewizardplusplus/go-html-selector"
	"github.com/thewizardplusplus/go-html-selector/builders"
)

// main demonstrates htmlselector.SelectTags() with the text builder merged
// with the structural builder: it prints the collected tags with their
// attributes first, and the collected text parts afterwards.
func main() {
	// Two list items, each mixing plain text with a link and a video.
	const markup = `
		<ul>
			<li>
				link #1: <a href="http://example.com/1">one</a>
				<video
					src="http://example.com/1.1"
					poster="http://example.com/1.2">
					Unable to embed video #1.
				</video>
			</li>
			<li>
				link #2: <a href="http://example.com/2">two</a>
				<video
					src="http://example.com/2.1"
					poster="http://example.com/2.2">
					Unable to embed video #2.
				</video>
			</li>
		</ul>
	`

	// Select "href" of <a> tags and "src"/"poster" of <video> tags.
	tagFilters := htmlselector.OptimizeFilters(htmlselector.FilterGroup{
		"a":     {"href"},
		"video": {"src", "poster"},
	})

	// Both builders are fed from a single pass through htmlselector.MultiBuilder;
	// whitespace-only text is skipped.
	structure := new(builders.StructuralBuilder)
	text := new(builders.TextBuilder)
	if err := htmlselector.SelectTags(
		strings.NewReader(markup),
		tagFilters,
		htmlselector.MultiBuilder{Builder: structure, TextBuilder: text},
		htmlselector.SkipEmptyText(),
	); err != nil {
		log.Fatal(err)
	}

	for _, selectedTag := range structure.Tags() {
		fmt.Printf("<%s>:\n", selectedTag.Name)
		for _, attr := range selectedTag.Attributes {
			fmt.Printf("  %s=%q\n", attr.Name, attr.Value)
		}
	}

	fmt.Println("text parts:")
	for _, part := range text.TextParts() {
		// Strip the surrounding indentation and line breaks before printing.
		part = bytes.TrimSpace(part)
		fmt.Printf("  %q\n", part)
	}

	// Output:
	// <a>:
	//   href="http://example.com/1"
	// <video>:
	//   src="http://example.com/1.1"
	//   poster="http://example.com/1.2"
	// <a>:
	//   href="http://example.com/2"
	// <video>:
	//   src="http://example.com/2.1"
	//   poster="http://example.com/2.2"
	// text parts:
	//   "link #1:"
	//   "one"
	//   "Unable to embed video #1."
	//   "link #2:"
	//   "two"
	//   "Unable to embed video #2."
}

Benchmarks

With Structural Builder

htmlselector.SelectTags() with a simple markup:

BenchmarkSelectTags/structural_builder/simple_markup/10_tags/430B-8         	  200000	     12572 ns/op	     8124 B/op	      35 allocs/op
BenchmarkSelectTags/structural_builder/simple_markup/100_tags/4.4K-8        	   20000	     93708 ns/op	    49167 B/op	     305 allocs/op
BenchmarkSelectTags/structural_builder/simple_markup/1000_tags/45.7K-8      	    3000	    716388 ns/op	   367374 B/op	    3005 allocs/op
BenchmarkSelectTags/structural_builder/simple_markup/10000_tags/476.3K-8    	     300	   7021473 ns/op	  2772749 B/op	   30005 allocs/op
BenchmarkSelectTags/structural_builder/simple_markup/100000_tags/4.8M-8     	      20	  59393679 ns/op	  8104370 B/op	  300005 allocs/op
BenchmarkSelectTags/structural_builder/simple_markup/1000000_tags/50.3M-8   	       1	1166593360 ns/op	998999904 B/op	 3000006 allocs/op

htmlselector.SelectTags() with a complex markup:

BenchmarkSelectTags/structural_builder/complex_markup/10_tags/1020B-8       	  100000	     18614 ns/op	     8117 B/op	     106 allocs/op
BenchmarkSelectTags/structural_builder/complex_markup/100_tags/10.4K-8      	   10000	    194201 ns/op	   155986 B/op	    1006 allocs/op
BenchmarkSelectTags/structural_builder/complex_markup/1000_tags/108.8K-8    	    1000	   1725249 ns/op	   379632 B/op	   10006 allocs/op
BenchmarkSelectTags/structural_builder/complex_markup/10000_tags/1.1M-8     	     100	  23956279 ns/op	 18179332 B/op	  100006 allocs/op
BenchmarkSelectTags/structural_builder/complex_markup/100000_tags/11.6M-8   	      10	 184968462 ns/op	 38395632 B/op	 1000006 allocs/op
BenchmarkSelectTags/structural_builder/complex_markup/1000000_tags/120.6M-8 	       1	1857636429 ns/op	383995632 B/op	10000006 allocs/op

htmlselector.SelectTags() with the universal tag:

BenchmarkSelectTags/structural_builder/universal_tag/10_tags/1020B-8         	   50000	     25557 ns/op	    18370 B/op	     126 allocs/op
BenchmarkSelectTags/structural_builder/universal_tag/100_tags/10.4K-8        	   10000	    311815 ns/op	   168108 B/op	    1206 allocs/op
BenchmarkSelectTags/structural_builder/universal_tag/1000_tags/108.8K-8      	    1000	   2407620 ns/op	  1437633 B/op	   12006 allocs/op
BenchmarkSelectTags/structural_builder/universal_tag/10000_tags/1.1M-8       	     100	  22065997 ns/op	 11180076 B/op	  120006 allocs/op
BenchmarkSelectTags/structural_builder/universal_tag/100000_tags/11.6M-8     	      10	 445495615 ns/op	244945105 B/op	 1200006 allocs/op
BenchmarkSelectTags/structural_builder/universal_tag/1000000_tags/120.6M-8   	       1	2307457875 ns/op	383996112 B/op	12000006 allocs/op

With Flatten Builder

htmlselector.SelectTags() with a simple markup:

BenchmarkSelectTags/flatten_builder/simple_markup/10_tags/430B-8            	  200000	      6244 ns/op	     5971 B/op	      15 allocs/op
BenchmarkSelectTags/flatten_builder/simple_markup/100_tags/4.4K-8           	   30000	     54042 ns/op	    21038 B/op	     105 allocs/op
BenchmarkSelectTags/flatten_builder/simple_markup/1000_tags/45.7K-8         	    3000	    530844 ns/op	   191624 B/op	    1005 allocs/op
BenchmarkSelectTags/flatten_builder/simple_markup/10000_tags/476.3K-8       	     300	   5116874 ns/op	  1402546 B/op	   10005 allocs/op
BenchmarkSelectTags/flatten_builder/simple_markup/100000_tags/4.8M-8        	      20	  54404882 ns/op	 23420178 B/op	  100005 allocs/op
BenchmarkSelectTags/flatten_builder/simple_markup/1000000_tags/50.3M-8      	       2	 663629704 ns/op	284703000 B/op	 1000005 allocs/op

htmlselector.SelectTags() with a complex markup:

BenchmarkSelectTags/flatten_builder/complex_markup/10_tags/1020B-8          	  100000	     15029 ns/op	     11411 B/op	      46 allocs/op
BenchmarkSelectTags/flatten_builder/complex_markup/100_tags/10.4K-8         	   10000	    150423 ns/op	     90183 B/op	     406 allocs/op
BenchmarkSelectTags/flatten_builder/complex_markup/1000_tags/108.8K-8       	    1000	   1268568 ns/op	     74576 B/op	    4006 allocs/op
BenchmarkSelectTags/flatten_builder/complex_markup/10000_tags/1.1M-8        	     100	  19005918 ns/op	  10593772 B/op	   40006 allocs/op
BenchmarkSelectTags/flatten_builder/complex_markup/100000_tags/11.6M-8      	      10	 138218081 ns/op	   7442576 B/op	  400006 allocs/op
BenchmarkSelectTags/flatten_builder/complex_markup/1000000_tags/120.6M-8    	       1	2141494094 ns/op	1313346192 B/op	 4000007 allocs/op

htmlselector.SelectTags() with the universal tag:

BenchmarkSelectTags/flatten_builder/universal_tag/10_tags/1020B-8            	  100000	     17471 ns/op	     9823 B/op	      46 allocs/op
BenchmarkSelectTags/flatten_builder/universal_tag/100_tags/10.4K-8           	   10000	    140121 ns/op	    57790 B/op	     406 allocs/op
BenchmarkSelectTags/flatten_builder/universal_tag/1000_tags/108.8K-8         	    1000	   1577272 ns/op	   802345 B/op	    4006 allocs/op
BenchmarkSelectTags/flatten_builder/universal_tag/10000_tags/1.1M-8          	     100	  15151013 ns/op	  5776548 B/op	   40006 allocs/op
BenchmarkSelectTags/flatten_builder/universal_tag/100000_tags/11.6M-8        	      10	 164593709 ns/op	 70617641 B/op	  400006 allocs/op
BenchmarkSelectTags/flatten_builder/universal_tag/1000000_tags/120.6M-8      	       1	1620865858 ns/op	869134992 B/op	 4000007 allocs/op

With Structural & Text Builders

htmlselector.SelectTags() with a simple markup:

BenchmarkSelectTags/text_builder/simple_markup/10_tags/430B-8         	  122985	     10339 ns/op	     9466 B/op	      45 allocs/op
BenchmarkSelectTags/text_builder/simple_markup/100_tags/4.4K-8        	   15040	     78519 ns/op	    48475 B/op	     405 allocs/op
BenchmarkSelectTags/text_builder/simple_markup/1000_tags/45.7K-8      	    1990	    813642 ns/op	   510223 B/op	    4005 allocs/op
BenchmarkSelectTags/text_builder/simple_markup/10000_tags/476.3K-8    	     160	   6728975 ns/op	  3817896 B/op	   40005 allocs/op
BenchmarkSelectTags/text_builder/simple_markup/100000_tags/4.8M-8     	      19	  75982499 ns/op	 56745964 B/op	  400005 allocs/op
BenchmarkSelectTags/text_builder/simple_markup/1000000_tags/50.3M-8   	       2	 664071080 ns/op	290158432 B/op	 4000005 allocs/op

htmlselector.SelectTags() with a complex markup:

BenchmarkSelectTags/text_builder/complex_markup/10_tags/1020B-8       	   68089	     20964 ns/op	    21599 B/op	     116 allocs/op
BenchmarkSelectTags/text_builder/complex_markup/100_tags/10.4K-8      	    7014	    167202 ns/op	    41712 B/op	    1106 allocs/op
BenchmarkSelectTags/text_builder/complex_markup/1000_tags/108.8K-8    	     717	   1663950 ns/op	   387312 B/op	   11006 allocs/op
BenchmarkSelectTags/text_builder/complex_markup/10000_tags/1.1M-8     	      69	  23245678 ns/op	 27798263 B/op	  110006 allocs/op
BenchmarkSelectTags/text_builder/complex_markup/100000_tags/11.6M-8   	       6	 173781906 ns/op	 38403312 B/op	 1100006 allocs/op
BenchmarkSelectTags/text_builder/complex_markup/1000000_tags/120.6M-8 	       1	1740418254 ns/op	384003328 B/op	11000006 allocs/op

htmlselector.SelectTags() with the universal tag:

BenchmarkSelectTags/text_builder/universal_tag/10_tags/1020B-8        	   61375	     19060 ns/op	     8168 B/op	     136 allocs/op
BenchmarkSelectTags/text_builder/universal_tag/100_tags/10.4K-8       	    6638	    258572 ns/op	   407046 B/op	    1306 allocs/op
BenchmarkSelectTags/text_builder/universal_tag/1000_tags/108.8K-8     	     615	   1775232 ns/op	   387360 B/op	   13006 allocs/op
BenchmarkSelectTags/text_builder/universal_tag/10000_tags/1.1M-8      	      57	  27496277 ns/op	 43162804 B/op	  130006 allocs/op
BenchmarkSelectTags/text_builder/universal_tag/100000_tags/11.6M-8    	       6	 187202961 ns/op	 38403378 B/op	 1300006 allocs/op
BenchmarkSelectTags/text_builder/universal_tag/1000000_tags/120.6M-8  	       1	1854043284 ns/op	384003408 B/op	13000006 allocs/op

License

The MIT License (MIT)

Copyright © 2020 thewizardplusplus

# Packages

No description provided by the author
No description provided by the author

# Functions

OptimizeFilters ...
SelectTags ...
SkipEmptyAttributes ...
SkipEmptyTags ...
SkipEmptyText ...

# Constants

# Structs

MultiBuilder ...
OptionConfig ...

# Interfaces

Builder ...
TextBuilder ...

# Type aliases

AttributeName ...
FilterGroup ...
OptimizedAttributeFilterGroup ...
OptimizedFilterGroup ...
Option ...
TagName ...