@@ -1,18 +1,30 @@
# tiktoken

-BPE tokenization for OpenAI models
+tiktoken is OpenAI's open-source BPE (Byte Pair Encoding) tokenizer: it splits text into tokens and maps between text and token IDs, which also makes it useful for counting tokens and token-frequency statistics.

## Usage

```
pip install tiktoken
+```
+
+Tokenizing text:
+```
+import tiktoken
+
+# Load a BPE encoding by name; cl100k_base is the encoding used by gpt-3.5-turbo and gpt-4
+encoding = tiktoken.get_encoding("cl100k_base")
+text = "hello, world!"
+tokens = encoding.encode(text)
+print(tokens)
+```
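+
+To see which encoding names are available, a quick sketch using tiktoken's `list_encoding_names`:
+```
+import tiktoken
+
+# Print the names of all registered encodings, e.g. cl100k_base
+print(tiktoken.list_encoding_names())
+```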
+
+Encoding and decoding:
+```
import tiktoken
-import torch

text = f"""
hello world
"""
+# For gpt-4, use tiktoken.encoding_for_model("gpt-4")
encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")

# Encode
@@ -22,4 +34,15 @@ print(tokens);

# Decode
[encoding.decode_single_token_bytes(token) for token in tokens]

+```
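+
+To recover the whole string in one call instead of per-token bytes, a minimal round-trip sketch using `encoding.decode` (the inverse of `encode`), reusing `tokens` from the encode step above:
+```
+# Decode the full token sequence back to the original text
+print(encoding.decode(tokens))
+```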
+
+Token-frequency statistics:
+```
+import tiktoken
+from collections import Counter
+
+encoding = tiktoken.get_encoding("cl100k_base")
+text = "I am running, I am happy, I like running."
+tokens = encoding.encode(text)
+# tiktoken has no built-in frequency counter, so tally the token IDs with collections.Counter
+token_counts = Counter(tokens)
+print(token_counts)
```
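+
+A common application of all this is counting tokens before sending a prompt, e.g. to estimate context usage (a minimal sketch; the model name is only an example):
+```
+import tiktoken
+
+encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")
+# Number of tokens the model will see for this prompt
+print(len(encoding.encode("hello world")))
+```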