github.com/meinside/simple-scrapper-go
module / package
0.0.9
Repository: https://github.com/meinside/simple-scrapper-go.git
Documentation: pkg.go.dev

# README

simple-scrapper-go

A simple web scrapper written in Golang.

Fetches content from URLs with headless browsers, then removes unwanted elements from the results with goquery and related tools.

Usage

package main

import (
	"log"
	"strings"

	ssg "github.com/meinside/simple-scrapper-go"
)

const (
	returnAsHTML = true
)

func main() {
	urls := []string{
		"https://x.com/elonmusk/status/1813223978884079966",
		"https://www.reddit.com/r/IAmA/comments/2rgsan/i_am_elon_musk_ceocto_of_a_rocket_company_ama/",
	}

	// create client
	if client, err := ssg.NewScrapper(); err == nil {
		client.SetURLReplacer(func(url string) string {
			// NOTE: use `old.reddit.com` instead of `www.reddit.com`
			if strings.HasPrefix(url, "https://www.reddit.com/") {
				return strings.Replace(url, "www.reddit.com", "old.reddit.com", 1)
			}
			return url
		})
		client.SetSelectorReturner(func(url string) string {
			// NOTE: select `div[data-testid="tweetText"]` for `x.com`
			if strings.Contains(url, "x.com/") {
				return `div[data-testid="tweetText"]`
			}
			return `body`
		})

		// crawl urls
		if crawled, err := client.CrawlURLs(urls, returnAsHTML); err == nil {
			log.Printf("crawled things: %+v", crawled)
		} else {
			log.Printf("failed to crawl urls: %s", err)
		}

		// close client
		if err := client.Close(); err != nil {
			log.Printf("failed to close scrapper: %s", err)
		}
	} else {
		log.Printf("failed to create scrapper: %s", err)
	}
}

Known Issues

  • Scraping Reddit URLs often fails.
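    The URL replacer in the usage example above, which swaps `www.reddit.com` for `old.reddit.com`, may work around this in some cases.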

License

MIT

# Functions

CrawlURLs crawls contents from given `urls`.
NewScrapper creates a new scrapper client.
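
For reference, a minimal sketch that uses only NewScrapper, CrawlURLs, and Close, without the custom URL replacer or selector from the usage example above. The target URL is a placeholder, and passing false as the second argument of CrawlURLs is assumed to return plain text instead of HTML:

package main

import (
	"log"

	ssg "github.com/meinside/simple-scrapper-go"
)

func main() {
	// create a scrapper client
	client, err := ssg.NewScrapper()
	if err != nil {
		log.Fatalf("failed to create scrapper: %s", err)
	}
	// close the client when done (close error ignored in this sketch)
	defer client.Close()

	// crawl a single placeholder URL; `false` = do not return as HTML (assumed)
	if crawled, err := client.CrawlURLs([]string{"https://example.com"}, false); err == nil {
		log.Printf("crawled: %+v", crawled)
	} else {
		log.Printf("failed to crawl: %s", err)
	}
}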

# Structs

Scrapper struct.
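A Scrapper is created with NewScrapper, optionally configured with SetURLReplacer and SetSelectorReturner, used through CrawlURLs, and released with Close, as shown in the usage example above.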