Java-Python的完全对齐的tokenizer(字级别)
原创
©著作权归作者所有:来自51CTO博客作者TechOnly的原创作品,请联系作者获取转载授权,否则将追究法律责任
python侧:
def tokenize_to_str_list(textString):
    """Split a string into a list of single-character tokens.

    Args:
        textString: the text to tokenize.

    Returns:
        A list with one element per character of ``textString``, in order.
        An empty string yields an empty list.
    """
    # Iterating a str yields the same items as indexing 0..len-1.
    return list(textString)
def convert_to_int_list(split_tokens):
    """Convert tokens to their ids, dropping tokens that have no id.

    Args:
        split_tokens: iterable of tokens, e.g. the output of
            ``tokenize_to_str_list``.

    Returns:
        A list holding ``char2id[token]`` for every token present in
        ``char2id``, in input order. Tokens absent from ``char2id`` are
        skipped, so the result may be shorter than the input.

    Note:
        ``char2id`` is not defined in this function; it is looked up from the
        enclosing scope (presumably a module-level token-to-id dict; confirm).
    """
    output = []
    for token in split_tokens:
        if token in char2id:
            # Look up by the loop variable; the result list is returned below.
            output.append(char2id[token])
    return output
java侧:
/**
 * Splits a string into single-character tokens, one token per Unicode code point.
 *
 * <p>Iterating by code point (rather than by UTF-16 {@code char}) keeps a
 * supplementary character such as an emoji as one token instead of two
 * surrogate halves, which matches how Python 3 indexes a {@code str}.
 *
 * @param textString the text to tokenize; must not be {@code null}
 * @return an array with one string per code point of {@code textString}, in order;
 *         empty when {@code textString} is empty
 */
public String[] tokenize_to_str_list(final String textString) {
    final int unitCount = textString.length();
    final String[] split_tokens = new String[textString.codePointCount(0, unitCount)];
    int tokenIndex = 0;
    int offset = 0;
    while (offset < unitCount) {
        final int codePoint = textString.codePointAt(offset);
        split_tokens[tokenIndex] = new String(Character.toChars(codePoint));
        tokenIndex++;
        // Advance by 1 or 2 UTF-16 units depending on the code point's width.
        offset += Character.charCount(codePoint);
    }
    return split_tokens;
}
public int[] convert_to_int_list(final String[] split_tokens) {
int seqLen = split_tokens.length;
int[] output = new int[seqLen];
int index = 0
for(int i = 0; i < seqLen; i++){
if(char2id.containsKey(split_tokens[i])){
output[index] = char2id.get(split_tokens[i]);
index = index + 1;
}
}
return output;
}