  @classmethod
  def from_dict(cls, json_object):
    config = BertConfig(vocab_size=None)
    for (key, value) in six.iteritems(json_object):
      config.__dict__[key] = value
    return config
  @classmethod
  def from_json_file(cls, json_file):
    with tf.gfile.GFile(json_file, "r") as reader:
      text = reader.read()
    return cls.from_dict(json.loads(text))
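As a usage sketch: the checkpoint path below is hypothetical, and the field names are simply the ones found in a released bert_config.json, so treat them as assumptions.

config = BertConfig.from_json_file("uncased_L-12_H-768_A-12/bert_config.json")
print(config.hidden_size)        # 768 for BERT-Base
print(config.num_hidden_layers)  # 12 for BERT-Base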
  def __str__(self):
    s = ""
    s += "tokens: %s\n" % (" ".join(
        [tokenization.printable_text(x) for x in self.tokens]))
    s += "segment_ids: %s\n" % (" ".join([str(x) for x in self.segment_ids]))
    s += "is_random_next: %s\n" % self.is_random_next
    s += "masked_lm_positions: %s\n" % (" ".join(
        [str(x) for x in self.masked_lm_positions]))
    s += "masked_lm_labels: %s\n" % (" ".join(
        [tokenization.printable_text(x) for x in self.masked_lm_labels]))
    s += "\n"
    return s
def create_training_instances(input_files, tokenizer, max_seq_length,
                              dupe_factor, short_seq_prob, masked_lm_prob,
                              max_predictions_per_seq, rng):
  # Outer list: documents; inner list: the tokenized sentences of each document.
  all_documents = [[]]
  for input_file in input_files:
    with tf.gfile.GFile(input_file, "r") as reader:
      while True:
        line = tokenization.convert_to_unicode(reader.readline())
        if not line:
          break
        line = line.strip()
        # An empty line marks a document boundary.
        if not line:
          all_documents.append([])
        tokens = tokenizer.tokenize(line)
        if tokens:
          all_documents[-1].append(tokens)

  # Remove empty documents.
  all_documents = [x for x in all_documents if x]
  rng.shuffle(all_documents)

  vocab_words = list(tokenizer.vocab.keys())
  instances = []
  # Each document is processed dupe_factor times, so different masks and
  # segment pairs get sampled from the same text.
  for _ in range(dupe_factor):
    for document_index in range(len(all_documents)):
      instances.extend(
          create_instances_from_document(
              all_documents, document_index, max_seq_length, short_seq_prob,
              masked_lm_prob, max_predictions_per_seq, vocab_words, rng))

  rng.shuffle(instances)
  return instances
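The expected corpus format follows directly from the loop above: one sentence per line, with a blank line separating documents. A minimal sketch (the file name and sentences are made up; the flag values shown are the script's defaults, and `tokenizer` is assumed to be a FullTokenizer built beforehand):

# Hypothetical corpus file: one sentence per line, blank line = new document.
sample = (
    "This is the first sentence of document one.\n"
    "This is its second sentence.\n"
    "\n"
    "Document two starts here, after the blank line.\n")
with open("sample_corpus.txt", "w") as f:
  f.write(sample)

# Assumed call, with a seeded random.Random for reproducibility:
# import random
# instances = create_training_instances(
#     input_files=["sample_corpus.txt"], tokenizer=tokenizer,
#     max_seq_length=128, dupe_factor=10, short_seq_prob=0.1,
#     masked_lm_prob=0.15, max_predictions_per_seq=20,
#     rng=random.Random(12345))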
def create_masked_lm_predictions(tokens, masked_lm_prob,
                                 max_predictions_per_seq, vocab_words, rng):
  cand_indexes = []
  # [CLS] and [SEP] are never used as masking candidates.
  for (i, token) in enumerate(tokens):
    if token == "[CLS]" or token == "[SEP]":
      continue
    # With whole-word masking, a "##"-prefixed subword joins the candidate
    # group of the word it continues, so the whole word is masked together.
    if (FLAGS.do_whole_word_mask and len(cand_indexes) >= 1 and
        token.startswith("##")):
      cand_indexes[-1].append(i)
    else:
      cand_indexes.append([i])

  rng.shuffle(cand_indexes)

  output_tokens = list(tokens)

  num_to_predict = min(max_predictions_per_seq,
                       max(1, int(round(len(tokens) * masked_lm_prob))))

  masked_lms = []
  covered_indexes = set()
  for index_set in cand_indexes:
    if len(masked_lms) >= num_to_predict:
      break
    # Skip this candidate if masking the whole word would exceed the budget.
    if len(masked_lms) + len(index_set) > num_to_predict:
      continue
    is_any_index_covered = False
    for index in index_set:
      if index in covered_indexes:
        is_any_index_covered = True
        break
    if is_any_index_covered:
      continue
    for index in index_set:
      covered_indexes.add(index)

      masked_token = None
      # 80% of the time, replace with [MASK]
      if rng.random() < 0.8:
        masked_token = "[MASK]"
      else:
        # 10% of the time, keep the original token
        if rng.random() < 0.5:
          masked_token = tokens[index]
        # 10% of the time, replace with a random word
        else:
          masked_token = vocab_words[rng.randint(0, len(vocab_words) - 1)]

      output_tokens[index] = masked_token

      masked_lms.append(MaskedLmInstance(index=index, label=tokens[index]))
  assert len(masked_lms) <= num_to_predict
  # Re-sort by index so the predictions appear in their original sentence order.
  masked_lms = sorted(masked_lms, key=lambda x: x.index)

  masked_lm_positions = []
  masked_lm_labels = []
  for p in masked_lms:
    masked_lm_positions.append(p.index)
    masked_lm_labels.append(p.label)

  return (output_tokens, masked_lm_positions, masked_lm_labels)
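As a worked example of the budget: with a 128-token sequence and masked_lm_prob=0.15, round(128 * 0.15) = 19 positions are selected, further capped by max_predictions_per_seq. The small standalone snippet below is not part of the script; it just illustrates the 80/10/10 split implemented above.

# Toy simulation of the masking decision: ~80% [MASK], ~10% keep the
# original token, ~10% replace with a random vocabulary word.
import collections
import random

rng = random.Random(12345)
counts = collections.Counter()
for _ in range(10000):
  if rng.random() < 0.8:
    counts["[MASK]"] += 1
  elif rng.random() < 0.5:
    counts["keep original"] += 1
  else:
    counts["random word"] += 1
print(counts)  # roughly 8000 / 1000 / 1000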
class FullTokenizer(object):
  def __init__(self, vocab_file, do_lower_case=True):
    self.vocab = load_vocab(vocab_file)
    self.inv_vocab = {v: k for k, v in self.vocab.items()}
    self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case)
    self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)
  def tokenize(self, text):
    split_tokens = []
    for token in self.basic_tokenizer.tokenize(text):
      for sub_token in self.wordpiece_tokenizer.tokenize(token):
        split_tokens.append(sub_token)
    return split_tokens
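A minimal usage sketch, assuming a released vocab.txt is available at the (hypothetical) path below; the exact WordPiece split depends on the vocabulary, so the printed output is only indicative.

tokenizer = FullTokenizer(vocab_file="uncased_L-12_H-768_A-12/vocab.txt",
                          do_lower_case=True)
print(tokenizer.tokenize("Here is the sentence I want embeddings for."))
# With the standard uncased vocab this comes out roughly as:
# ['here', 'is', 'the', 'sentence', 'i', 'want', 'em', '##bed', '##ding',
#  '##s', 'for', '.']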
class BasicTokenizer(object):
  def __init__(self, do_lower_case=True):
    self.do_lower_case = do_lower_case
  def tokenize(self, text):
    # Essentially turns the string into a list of tokens, handling English
    # words and Chinese characters separately.
    # e.g. "Mr. Cassius crossed the highway, and stopped suddenly." becomes
    # ['mr', '.', 'cassius', 'crossed', 'the', 'highway', ',', 'and',
    #  'stopped', 'suddenly', '.']
    text = convert_to_unicode(text)
    text = self._clean_text(text)
    text = self._tokenize_chinese_chars(text)
    # whitespace_tokenize simply splits the string on whitespace into a list.
    orig_tokens = whitespace_tokenize(text)
    split_tokens = []
    for token in orig_tokens:
      if self.do_lower_case:
        token = token.lower()
        # Strips accent marks (combining characters); see _run_strip_accents below.
        token = self._run_strip_accents(token)
      split_tokens.extend(self._run_split_on_punc(token))

    output_tokens = whitespace_tokenize(" ".join(split_tokens))
    return output_tokens
  def _run_strip_accents(self, text):
    text = unicodedata.normalize("NFD", text)
    output = []
    for char in text:
      cat = unicodedata.category(char)
      if cat == "Mn":
        continue
      output.append(char)
    return "".join(output)
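A quick illustration (run against the tokenization module): NFD decomposition splits an accented letter into the base letter plus a combining mark (Unicode category "Mn"), and the mark is then dropped.

basic = BasicTokenizer(do_lower_case=True)
print(basic._run_strip_accents(u"Hélène"))  # -> 'Helene'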
  def _run_split_on_punc(self, text):
    chars = list(text)
    i = 0
    start_new_word = True
    output = []
    while i < len(chars):
      char = chars[i]
      if _is_punctuation(char):
        output.append([char])
        start_new_word = True
      else:
        if start_new_word:
          output.append([])
        start_new_word = False
        output[-1].append(char)
      i += 1

    return ["".join(x) for x in output]
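Example behavior: every punctuation character becomes its own token, and the runs of characters around it are kept intact.

basic = BasicTokenizer()
print(basic._run_split_on_punc("hello,world."))  # -> ['hello', ',', 'world', '.']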
  def _tokenize_chinese_chars(self, text):
    # Splits Chinese text character by character: English words are left
    # unchanged, while each CJK character gets a space added on both sides.
    output = []
    for char in text:
      cp = ord(char)
      if self._is_chinese_char(cp):
        output.append(" ")
        output.append(char)
        output.append(" ")
      else:
        output.append(char)
    return "".join(output)
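For example, because each CJK character is wrapped in spaces, the later whitespace split yields one token per character (note the doubled space between adjacent CJK characters):

basic = BasicTokenizer()
print(repr(basic._tokenize_chinese_chars(u"BERT模型")))
# 'BERT 模  型 '  -> whitespace_tokenize gives ['BERT', '模', '型']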
  def _is_chinese_char(self, cp):
    # Checks whether the code point is a CJK character. This check is quite
    # useful and worth borrowing.
    # refer: https://www.cnblogs.com/straybirds/p/6392306.html
    if ((cp >= 0x4E00 and cp <= 0x9FFF) or
        (cp >= 0x3400 and cp <= 0x4DBF) or
        (cp >= 0x20000 and cp <= 0x2A6DF) or
        (cp >= 0x2A700 and cp <= 0x2B73F) or
        (cp >= 0x2B740 and cp <= 0x2B81F) or
        (cp >= 0x2B820 and cp <= 0x2CEAF) or
        (cp >= 0xF900 and cp <= 0xFAFF) or
        (cp >= 0x2F800 and cp <= 0x2FA1F)):
      return True

    return False
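A quick sanity check of the code-point ranges:

basic = BasicTokenizer()
print(hex(ord(u"中")))                     # 0x4e2d, inside the basic CJK block 0x4E00-0x9FFF
print(basic._is_chinese_char(ord(u"中")))  # True
print(basic._is_chinese_char(ord(u"A")))   # False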
  def _clean_text(self, text):
    # Removes meaningless characters (NUL, U+FFFD, control characters) and
    # normalizes every whitespace character to a plain space.
    output = []
    for char in text:
      cp = ord(char)
      if cp == 0 or cp == 0xfffd or _is_control(char):
        continue
      if _is_whitespace(char):
        output.append(" ")
      else:
        output.append(char)
    return "".join(output)
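For example, a NUL byte is dropped and a tab is normalized to a regular space:

basic = BasicTokenizer()
print(repr(basic._clean_text(u"hello\tworld\x00")))  # -> 'hello world'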