modulepackage
0.0.2
Repository: https://github.com/heapstacktree/easychars.git
Documentation: pkg.go.dev
# README
easychars
Based on saintfish/chardet and golang.org/x/text/encoding/, easychars makes it convenient to detect the charset of content and convert it to UTF-8.
Supported charsets
-
Unicode: UTF-8, UTF-16LE, UTF-16BE, UTF-32LE, UTF-32BE
-
Simplified Chinese: GB2312, GBK, GB18030 (includes GB2312 and GBK)
-
Traditional Chinese: Big5, EUC-TW
-
Japanese: EUC-JP, Shift_JIS, ISO-2022-JP
-
Korean: EUC-KR, ISO-2022-KR
-
Russian:
-
Others: ISO-8859-1, ISO-8859-2, ISO-8859-5, ISO-8859-6, ISO-8859-7, ISO-8859-9, Windows-1250, Windows-1251, Windows-1254, Windows-1255, Windows-1256 ...
For other charsets, try easychars.ToUtf8WithCharsetName
to test whether they are supported.
Example
package main
import (
"fmt"
"github.com/HeapStackTree/easychars"
"os"
)
func ReadAndConvertFile(path string, charsetName string) (contentInUtf8 []byte, res *charset.Result, err error) {
res = &charset.Result{
Charset: "unknown",
Language: "unknown",
Confidence: 0,
Convertible: false,
}
content, err := os.ReadFile(path)
if err != nil {
return
}
if charsetName == "" {
contentInUtf8, res, err = easychars.DetectAndConvertToUtf8(content)
} else {
contentInUtf8, err = easychars.ToUtf8WithCharsetName(content, charsetName)
if err == nil {
res.Charset = charsetName
res.Confidence = 100
res.Convertible = true
}
}
return
}
// main converts a sample file to UTF-8 with ReadAndConvertFile and prints the
// detection result followed by the content starting at the first non-ASCII
// byte.
func main() {
	path := "tests/GB2312/_mozilla_bug171813_text.html"
	// Pass a charset name instead of "" if you are sure about it.
	content, res, err := ReadAndConvertFile(path, "")
	if err != nil {
		// Report the failure instead of exiting silently with status 0.
		fmt.Fprintln(os.Stderr, err)
		os.Exit(1)
	}

	// Skip the leading ASCII part. ASCII ends at 0x7F (DEL), so the first
	// non-ASCII byte is the first one >= 0x80. If there is none, the index
	// stays 0 and the whole content is printed.
	var nonASCIILoc int
	for i, v := range content {
		if v >= 0x80 {
			nonASCIILoc = i
			break
		}
	}

	fmt.Printf("Path: %s\nCharset: %s\nLanguage: %s\nConfidence: %d\nConvertible: %t\nContent: %s\n", path, res.Charset, res.Language, res.Confidence, res.Convertible, content[nonASCIILoc:])
	// Output should be:
	// Path: tests/GB2312/_mozilla_bug171813_text.html
	// Charset: GB-18030
	// Language: zh
	// Confidence: 100
	// Convertible: true
	// Content: 搜狐在线</b></font></a></div> ...
}
Check godoc for other methods.
# Functions
DetectAll returns all chardet.Results which have non-zero Confidence.
Detects the charset of content and converts it to UTF-8.
DetectEncoding returns the Result with the highest Confidence.
GetDecoderFromCharsetName returns the Decoder for the given charset name (case-insensitive).
GetEncodingFromCharsetName returns the encoding.Encoding for the given charset name (case-insensitive).
Checks whether content is valid UTF-8.
Get UTF-8 encoded []byte with charset name.
Get UTF-8 encoded []byte with Decoder.
Get UTF-8 encoded []byte with encoding.Encoding.
# Interfaces
alias for transform.Transformer.