github.com/shapor/tiktoken-go
Version: 0.0.0-20231005054651-7cc2a6fce10e
Repository: https://github.com/shapor/tiktoken-go.git
Documentation: pkg.go.dev


# README

tiktoken-go

OpenAI's tiktoken in Go.

Tiktoken is a fast BPE tokeniser for use with OpenAI's models.

This is a port of the original tiktoken.

Usage

Install

go get github.com/shapor/tiktoken-go

Examples

Get Token By Encoding

package main

import (
    "fmt"
    "github.com/shapor/tiktoken-go"
)

func main() {
	text := "Hello, world!"
	encoding := "cl100k_base"

	tke, err := tiktoken.GetEncoding(encoding)
	if err != nil {
		fmt.Printf("getEncoding: %v\n", err)
		return
	}

	// encode the text into token IDs
	token := tke.Encode(text, nil, nil)

	// tokens
	fmt.Println(token)
	// num_tokens
	fmt.Println(len(token))
}
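The encoder can also turn token IDs back into text. Assuming this fork keeps the upstream tiktoken-go Decode method, a round trip looks roughly like this (a sketch, not taken from the repository):

package main

import (
	"fmt"

	"github.com/shapor/tiktoken-go"
)

func main() {
	tke, err := tiktoken.GetEncoding("cl100k_base")
	if err != nil {
		fmt.Printf("getEncoding: %v\n", err)
		return
	}

	// encode, then decode the token IDs back into the original string
	tokens := tke.Encode("Hello, world!", nil, nil)
	fmt.Println(tke.Decode(tokens)) // Hello, world!
}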

Get Token By Model

package main

import (
    "fmt"
    "github.com/shapor/tiktoken-go"
)

func main() {
	text := "Hello, world!"
	model := "gpt-3.5-turbo"

	tkm, err := tiktoken.EncodingForModel(model)
	if err != nil {
		fmt.Printf("encodingForModel: %v\n", err)
		return
	}

	// encode the text into token IDs
	token := tkm.Encode(text, nil, nil)

	// tokens
	fmt.Println(token)
	// num_tokens
	fmt.Println(len(token))
}
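Because the encoder returned by EncodingForModel can be reused, counting tokens for several strings only needs one lookup. Here is a small sketch using only the calls shown above (the numTokens helper is an illustration, not part of the library):

package main

import (
	"fmt"

	"github.com/shapor/tiktoken-go"
)

// numTokens counts tokens for each input string with a single shared encoder.
func numTokens(model string, texts ...string) ([]int, error) {
	tkm, err := tiktoken.EncodingForModel(model)
	if err != nil {
		return nil, fmt.Errorf("encodingForModel: %w", err)
	}

	counts := make([]int, 0, len(texts))
	for _, t := range texts {
		counts = append(counts, len(tkm.Encode(t, nil, nil)))
	}
	return counts, nil
}

func main() {
	counts, err := numTokens("gpt-3.5-turbo", "Hello, world!", "How are you?")
	if err != nil {
		fmt.Println(err)
		return
	}
	fmt.Println(counts)
}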

Counting Tokens For Chat API Calls

Below is an example function for counting tokens for messages passed to gpt-3.5-turbo or gpt-4.

The following code is based on the openai-cookbook examples as of Wednesday, 28 June 2023.

Please note that the way tokens are counted for messages may change at any time, so this code may not be accurate in the future.

If you need an exact count, please refer to the official documentation.

If you find that this code no longer works, please feel free to submit a PR or an issue.

package main

import (
	"fmt"
	"log"
	"strings"

	"github.com/sashabaranov/go-openai"
	"github.com/shapor/tiktoken-go"
)

// OpenAI Cookbook: https://github.com/openai/openai-cookbook/blob/main/examples/How_to_count_tokens_with_tiktoken.ipynb
func NumTokensFromMessages(messages []openai.ChatCompletionMessage, model string) (numTokens int) {
	tkm, err := tiktoken.EncodingForModel(model)
	if err != nil {
		err = fmt.Errorf("encoding for model: %v", err)
		log.Println(err)
		return
	}

	var tokensPerMessage, tokensPerName int
	switch model {
	case "gpt-3.5-turbo-0613",
		"gpt-3.5-turbo-16k-0613",
		"gpt-4-0314",
		"gpt-4-32k-0314",
		"gpt-4-0613",
		"gpt-4-32k-0613":
		tokensPerMessage = 3
		tokensPerName = 1
	case "gpt-3.5-turbo-0301":
		tokensPerMessage = 4 // every message follows <|start|>{role/name}\n{content}<|end|>\n
		tokensPerName = -1   // if there's a name, the role is omitted
	default:
		if strings.Contains(model, "gpt-3.5-turbo") {
			log.Println("warning: gpt-3.5-turbo may update over time. Returning num tokens assuming gpt-3.5-turbo-0613.")
			return NumTokensFromMessages(messages, "gpt-3.5-turbo-0613")
		} else if strings.Contains(model, "gpt-4") {
			log.Println("warning: gpt-4 may update over time. Returning num tokens assuming gpt-4-0613.")
			return NumTokensFromMessages(messages, "gpt-4-0613")
		} else {
			err = fmt.Errorf("num_tokens_from_messages() is not implemented for model %s. See https://github.com/openai/openai-python/blob/main/chatml.md for information on how messages are converted to tokens.", model)
			log.Println(err)
			return
		}
	}

	for _, message := range messages {
		numTokens += tokensPerMessage
		numTokens += len(tkm.Encode(message.Content, nil, nil))
		numTokens += len(tkm.Encode(message.Role, nil, nil))
		numTokens += len(tkm.Encode(message.Name, nil, nil))
		if message.Name != "" {
			numTokens += tokensPerName
		}
	}
	numTokens += 3 // every reply is primed with <|start|>assistant<|message|>
	return numTokens
}
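As a quick illustration, here is a minimal main that could sit next to the function above in the same file (the messages and model name are just examples, not from the source):

func main() {
	messages := []openai.ChatCompletionMessage{
		{Role: "system", Content: "You are a helpful assistant."},
		{Role: "user", Content: "Hello, world!"},
	}

	// prints the prompt token count for this conversation
	fmt.Println(NumTokensFromMessages(messages, "gpt-3.5-turbo-0613"))
}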

Available Encodings

| Encoding name | OpenAI models |
| --- | --- |
| cl100k_base | gpt-4, gpt-3.5-turbo, text-embedding-ada-002 |
| p50k_base | Codex models, text-davinci-002, text-davinci-003 |
| r50k_base (or gpt2) | GPT-3 models like davinci |

Available Models

| Model name | Encoding |
| --- | --- |
| gpt-4-* | cl100k_base |
| gpt-3.5-turbo-* | cl100k_base |
| gpt-4 | cl100k_base |
| gpt-3.5-turbo | cl100k_base |
| text-davinci-003 | p50k_base |
| text-davinci-002 | p50k_base |
| text-davinci-001 | r50k_base |
| text-curie-001 | r50k_base |
| text-babbage-001 | r50k_base |
| text-ada-001 | r50k_base |
| davinci | r50k_base |
| curie | r50k_base |
| babbage | r50k_base |
| ada | r50k_base |
| code-davinci-002 | p50k_base |
| code-davinci-001 | p50k_base |
| code-cushman-002 | p50k_base |
| code-cushman-001 | p50k_base |
| davinci-codex | p50k_base |
| cushman-codex | p50k_base |
| text-davinci-edit-001 | p50k_edit |
| code-davinci-edit-001 | p50k_edit |
| text-embedding-ada-002 | cl100k_base |
| text-similarity-davinci-001 | r50k_base |
| text-similarity-curie-001 | r50k_base |
| text-similarity-babbage-001 | r50k_base |
| text-similarity-ada-001 | r50k_base |
| text-search-davinci-doc-001 | r50k_base |
| text-search-curie-doc-001 | r50k_base |
| text-search-babbage-doc-001 | r50k_base |
| text-search-ada-doc-001 | r50k_base |
| code-search-babbage-code-001 | r50k_base |
| code-search-ada-code-001 | r50k_base |
| gpt2 | gpt2 |

Test

You can run the tests in the test folder. They compare this port's output with the original tiktoken, both for getting tokens by encoding and for getting tokens by model.

Benchmark

You can run the benchmark in the test folder.
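If you want to reproduce the numbers on your own machine, a minimal Go benchmark of Encode looks roughly like this (a sketch, not the exact benchmark behind the table below; the input text and file/package names are arbitrary, e.g. a file encode_bench_test.go in a package of your own):

package bench

import (
	"testing"

	"github.com/shapor/tiktoken-go"
)

func BenchmarkEncode(b *testing.B) {
	tke, err := tiktoken.GetEncoding("cl100k_base")
	if err != nil {
		b.Fatalf("getEncoding: %v", err)
	}

	text := "Hello, world! This is a short benchmark string."
	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		tke.Encode(text, nil, nil)
	}
}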

Benchmark result

| name | time/op | os | cpu | text | times |
| --- | --- | --- | --- | --- | --- |
| tiktoken-go | 8795 ns | macOS 13.2 | Apple M1 | UDHR | 100000 |
| tiktoken | 8838 ns | macOS 13.2 | Apple M1 | UDHR | 100000 |

It looks like the performance is almost the same.

The small difference may come from machine-to-machine variation, or the benchmark method may simply not be ideal.

If you have a better benchmark method, or want to add your own benchmark result, please feel free to submit a PR.

License

MIT