# step2.py (806 B)
  1. import json
  2. def create_one_hot_encoding(vocabulary, word):
  3. encoding = [0] * len(vocabulary)
  4. if word in vocabulary:
  5. encoding[vocabulary[word]] = 1
  6. return encoding
  7. # 读取文本文件中的词表(使用 UTF-8 编码)
  8. with open("./vocabulary.txt", "r", encoding="utf-8") as file:
  9. loaded_vocabulary = [word.strip() for word in file.readlines()]
  10. # 创建词汇表字典
  11. vocabulary_dict = {word: index for index, word in enumerate(loaded_vocabulary)}
  12. # 创建 one-hot 编码字典
  13. one_hot_dict = {word: create_one_hot_encoding(vocabulary_dict, word) for word in vocabulary_dict}
  14. # 保存为 JSON 文件(使用 UTF-8 编码)
  15. with open("./one_hot_dict.json", "w", encoding="utf-8") as json_file:
  16. json.dump(one_hot_dict, json_file, ensure_ascii=False)
  17. print(one_hot_dict)