使用 LabelBinarizer 和 MultiLabelBinarizer 分别进行单标签和多标签的 one-hot 编码
此方法适用于类别之间没有天然内在顺序的字符串(名义)特征
5.1 Encoding Nominal Categorical Features
feature
# Load libraries: NumPy plus scikit-learn's LabelBinarizer / MultiLabelBinarizer for one-hot encoding
import numpy as np
from sklearn.preprocessing import LabelBinarizer, MultiLabelBinarizer
# Five observations of one nominal feature (a state name), laid out as a
# column vector: one row per observation, a single column.
feature = np.array(
    ["Texas", "California", "Texas", "Delaware", "Texas"]
).reshape(-1, 1)
array([[0, 0, 1],
[1, 0, 0],
[0, 0, 1],
[0, 1, 0],
[0, 0, 1]])
# Echo the feature array to inspect it; its repr follows on the next lines.
feature
array([['Texas'],
['California'],
['Texas'],
['Delaware'],
['Texas']], dtype='<U10')
# feature
# Create the one-hot encoder.
one_hot = LabelBinarizer()
# Fit the encoder on the feature and one-hot encode it in a single call.
one_hot.fit_transform(feature)
array([[0, 0, 1],
[1, 0, 0],
[0, 0, 1],
[0, 1, 0],
[0, 0, 1]])
查看特征分类
# View the feature classes learned by the encoder.
one_hot.classes_
array(['California', 'Delaware', 'Texas'], dtype='<U10')
逆转换
# Reverse the one-hot encoding: encode the feature, then map the encoded
# rows back to their original labels.
one_hot.inverse_transform(one_hot.transform(feature))
array(['Texas', 'California', 'Texas', 'Delaware', 'Texas'], dtype='<U10')
使用 pandas 创建虚拟变量
import pandas as pd
# Create dummy (indicator) variables from the single feature column.
# dtype=int pins the 0/1 output recorded below: pandas >= 2.0 defaults to
# bool and would display True/False instead.
pd.get_dummies(feature[:, 0], dtype=int)
California Delaware Texas
0 0 0 1
1 1 0 0
2 0 0 1
3 0 1 0
4 0 0 1
# Build the multi-label feature: each observation is a (first, second) pair
# of labels, assembled by zipping the two label sequences position by position.
multiclass_feature = list(
    zip(
        # first label of each observation
        ("Texas", "California", "Texas", "Delaware", "Texas"),
        # second label of each observation
        ("Florida", "Alabama", "Florida", "Florida", "Alabama"),
    )
)
multiclass_feature
# Create the multi-label one-hot encoder.
one_hot_multiclass = MultiLabelBinarizer()
# Fit on the label tuples and encode them; because each observation carries
# several labels, an encoded row can contain more than one 1.
one_hot_multiclass.fit_transform(multiclass_feature)
array([[0, 0, 0, 1, 1],
[1, 1, 0, 0, 0],
[0, 0, 0, 1, 1],
[0, 0, 1, 1, 0],
[1, 0, 0, 0, 1]])
# View the classes learned by the multi-label encoder.
one_hot_multiclass.classes_
array(['Alabama', 'California', 'Delaware', 'Florida', 'Texas'],
dtype=object)