🍦 机器学习是一种分析数据并学习预测结果的程序，是让计算机从研究和统计数据中学习，是迈向人工智能方向的一步。

1 机器学习

机器学习
- 数据类型三大类：数值的(numerical)、分类的(categorical)、序数的(ordinal)。
  - 数值数据是数字，分为两个类别：离散数据(仅限于整数)、连续数据(可取无限多个值)。
  - 分类数据是无法相互比较的值，而序数数据类似于分类数据，但是可以相互比较。
- 平均数mean、中位数median、众数mode、标准差std。
- 方差var、方差的平方根=标准差、百分位数percentile()。

import numpy
from scipy import stats
from warnings import simplefilter                            # 忽略警告信息
simplefilter(action="ignore", category=FutureWarning)

# Sample of 13 speed readings used to demonstrate basic descriptive statistics.
speed = [99, 86, 87, 88, 111, 86, 103, 87, 94, 78, 77, 85, 86]

a = numpy.percentile(speed, 80)     # 80th percentile: value below which 80% of the data falls
b = numpy.median(speed)             # median: middle value of the sorted data
c = numpy.mean(speed)               # arithmetic mean
e = numpy.var(speed)                # variance (sigma squared): mean of the squared deviations from the mean
d = numpy.std(speed)                # standard deviation (sigma): square root of the variance
f = stats.mode(speed)               # mode: the most frequent value

# Output order is unchanged: percentile, median, mean, std, variance, mode.
for statistic in (a, b, c, d, e, f):
    print(statistic)

1-1 数据分布

数据分布
- 正态数据分布：高斯数据分布，正态分布图也被叫做钟形曲线，具有钟形特征。
- 随机数据分布：测试算法时可能没有真实世界的数据，此时需用随机生成的值。

(1) 直方图

import numpy
import matplotlib.pyplot as plt

x = numpy.random.uniform(0.0, 5.0, 250)                       # 250个介于0和5之间的随机浮点数数组

print(x)
plt.hist(x, 5)                                                # 绘制直方图
plt.show()

(2) 正态分布

import numpy
import matplotlib.pyplot as plt

x = numpy.random.normal(5.0, 1.0, 100000)                     # 100000个平均数为5.0，标准差为1.0的数组

print(x)
plt.hist(x, 100)                                              # 正态数据分布，绘制包含100个柱的直方图
plt.show()

(3) 散点图

import matplotlib.pyplot as plt                               # Matplotlib有绘制散点图的方法，需两个相同长度的数组

x = [5, 7, 8, 7, 2, 17, 2, 9, 4, 11, 12, 9, 6]                # x数组代表每辆汽车的年龄，y数组代表每辆车的速度
y = [99, 86, 87, 88, 111, 86, 103, 87, 94, 78, 77, 85, 86]

plt.scatter(x, y)                                             # 绘制散点图
plt.show()                                                    # 越新的车越快，也可能是巧合，数据量少

(4) 随机分布

import numpy
import matplotlib.pyplot as plt

x = numpy.random.normal(5.0, 1.0, 1000)                       # 1000个数组，平均值为5.0，标准差为1.0的随机数
y = numpy.random.normal(10.0, 2.0, 1000)                      # 1000个数组，平均值为10.0，标准差为2.0的随机数

plt.scatter(x, y)                                             # 随机数据分布，具有1000个点的散点图
plt.show()

1-2 线性回归

线性回归
- 知道x和y轴的值之间的关系，没有关系则线性回归不能用于预测结果。
- R关系的取值范围为-1到1，0表示没有关系，1和-1则表示100%相关。

from scipy import stats
import matplotlib.pyplot as plt                              # 13辆汽车经过收费站时的车龄和速度

# Age (x) and speed (y) of 13 cars recorded while passing a tollbooth.
x = [5, 7, 8, 7, 2, 17, 2, 9, 4, 11, 12, 9, 6]
y = [99, 86, 87, 88, 111, 86, 103, 87, 94, 78, 77, 85, 86]

# Least-squares line through the points; r, p and std_err are unpacked but not used here.
slope, intercept, r, p, std_err = stats.linregress(x, y)


def func(x):
    """Return the y value of the fitted regression line at position ``x``."""
    return intercept + slope * x


# Evaluate the line at every observed x so it can be drawn over the scatter plot.
model = [func(age) for age in x]

plt.scatter(x, y)       # raw observations
plt.plot(x, model)      # fitted regression line
plt.show()

(1) R关系

from scipy import stats

# Car ages (x) and speeds (y); r tells how well a straight line fits them.
x = [5, 7, 8, 7, 2, 17, 2, 9, 4, 11, 12, 9, 6]
y = [99, 86, 87, 88, 111, 86, 103, 87, 94, 78, 77, 85, 86]

fit = stats.linregress(x, y)
slope, intercept, r, p, std_err = fit.slope, fit.intercept, fit.rvalue, fit.pvalue, fit.stderr

# r is about -0.76 for this data: a clear, but not perfect, negative linear relationship.
print(r)

(2) 预测未来价值

from scipy import stats

x = [5, 7, 8, 7, 2, 17, 2, 9, 4, 11, 12, 9, 6]
y = [99, 86, 87, 88, 111, 86, 103, 87, 94, 78, 77, 85, 86]

# Fit once at module level; func below reads the fitted slope and intercept.
slope, intercept, r, p, std_err = stats.linregress(x, y)


def func(x):
    """Return the speed predicted by the regression line for car age ``x``."""
    predicted = intercept + slope * x
    return predicted


# Predicted speed of a 10-year-old car (about 85.6); the same value can be read off the plot.
speed = func(10)
print(speed)

(3) 非预测最佳法

from scipy import stats                                      # 线性回归不是预测未来值的最佳方法
import matplotlib.pyplot as plt

x = [89, 43, 36, 36, 95, 10, 66, 34, 38, 20, 26, 29, 48, 64, 6, 5, 36, 66, 72, 40]
y = [21, 46, 3, 35, 67, 95, 53, 72, 58, 10, 26, 34, 90, 33, 38, 20, 56, 2, 47, 15]
slope, intercept, r, p, std_err = stats.linregress(x, y)
print(r)                                                     # 0.013，得到的R值小，表明关系很差，不适合做线性回归


def func(x):
    return slope * x + intercept


model = list(map(func, x))
plt.scatter(x, y)
plt.plot(x, model)
plt.show()                                                   # x轴和y轴的值导致线性回归的拟合非常差

1-3 多项式回归

多项式回归
- 若x和y轴的值之间没有关系，则多项式回归不能用于预测结果。
- R平方值的取值范围为0到1，0表示没有关系，1表示100%相关。

import numpy
import matplotlib.pyplot as plt                              # 收集了通过某收费站的18辆汽车超车发生时间和汽车速度

x = [1, 2, 3, 5, 6, 7, 8, 9, 10, 12, 13, 14, 15, 16, 18, 19, 21, 22]
y = [100, 90, 80, 60, 60, 55, 60, 65, 70, 70, 75, 76, 78, 79, 90, 99, 99, 100]

model = numpy.poly1d(numpy.polyfit(x, y, 3))                 # 注意poly1d是1d不是ld，建立多项式模型
lines = numpy.linspace(1, 22, 100)                           # 指定该行将如何显示，从位置1开始，到位置22结束

plt.scatter(x, y)                                            # 绘制原始散点图
plt.plot(lines, model(lines))                                # 绘制多项式回归线
plt.show()                                                   # 显示图表

(1) R平方

import numpy
from sklearn.metrics import r2_score

x = [1, 2, 3, 5, 6, 7, 8, 9, 10, 12, 13, 14, 15, 16, 18, 19, 21, 22]
y = [100, 90, 80, 60, 60, 55, 60, 65, 70, 70, 75, 76, 78, 79, 90, 99, 99, 100]

model = numpy.poly1d(numpy.polyfit(x, y, 3))                  # 注意poly1d是1d不是ld，建立多项式模型
print(r2_score(y, model(x)))                                  # 数据在多项式回归中的拟合程度，0.94表明关系非常好

(2) 预测未来价值

import numpy

x = [1, 2, 3, 5, 6, 7, 8, 9, 10, 12, 13, 14, 15, 16, 18, 19, 21, 22]
y = [100, 90, 80, 60, 60, 55, 60, 65, 70, 70, 75, 76, 78, 79, 90, 99, 99, 100]

# Fit a degree-3 polynomial and wrap the coefficients in a callable model
# (the class is poly1d with the digit one, not the letter l).
coefficients = numpy.polyfit(x, y, 3)
model = numpy.poly1d(coefficients)

# Predicted speed at x = 17 (about 88.87); the value can also be read from the plot.
speed = model(17)
print(speed)

(3) 非预测最佳法

import numpy
import matplotlib.pyplot as plt
from sklearn.metrics import r2_score                         # 多项式回归不是预测未来值的最佳方法

x = [89, 43, 36, 36, 95, 10, 66, 34, 38, 20, 26, 29, 48, 64, 6, 5, 36, 66, 72, 40]
y = [21, 46, 3, 35, 67, 95, 53, 72, 58, 10, 26, 34, 90, 33, 38, 20, 56, 2, 47, 15]

model = numpy.poly1d(numpy.polyfit(x, y, 3))                 # 注意poly1d是1d不是ld，建立多项式模型
lines = numpy.linspace(2, 95, 100)
print(r2_score(y, model(x)))                                 # 0.00995表明关系很差，该数据集不适合做多项式回归

plt.scatter(x, y)
plt.plot(lines, model(lines))
plt.show()                                                   # x轴和y轴的值导致多项式回归的拟合非常差

1-4 多元线性回归

import pandas
from sklearn import linear_model                             # data.csv改名为vehicle_info.csv

df = pandas.read_csv("data/vehicle_info.csv")                # https://www.w3schools.com/python/data.csv

X = df[["Weight", "Volume"]]                                 # 列出独立值并调用X变量
y = df["CO2"]                                                # 将依赖值放在一个名为y的变量中

regr = linear_model.LinearRegression()                       # 创建一个线性回归对象，将独立值和相关值作为参数
regr.fit(X.values, y.values)                                 # 并用描述关系的数据填充回归对象

# 根据汽车的重量和体积预测CO2值，预测一辆配备1.3升发动机、重量为2300kg的汽车每行驶一公里释放107克CO2
predictedCO2 = regr.predict([[2300, 1300]])
print(predictedCO2)

(1) 系数

import pandas                                                # 系数是描述与未知变量之间关系的因子
from sklearn import linear_model                             # data.csv改名为vehicle_info.csv

# data.csv from https://www.w3schools.com/python/data.csv, stored locally as vehicle_info.csv
df = pandas.read_csv("data/vehicle_info.csv")

feature_columns = ["Weight", "Volume"]
X = df[feature_columns]             # independent variables
y = df["CO2"]                       # dependent variable

# Fit an ordinary least-squares model on the raw (unscaled) feature values.
regr = linear_model.LinearRegression().fit(X.values, y.values)

# coef_ holds one coefficient per feature, in column order [Weight, Volume].
# Each one is the change in predicted CO2 for a one-unit increase of that
# feature while the other stays fixed. These notes report
# [0.00755095 0.00780526]: +0.00755095 g CO2 per extra kg of vehicle weight
# and +0.00780526 g CO2 per extra cm3 of engine volume.
print(regr.coef_)

(2) 预测未来价值

import pandas
from sklearn import linear_model                             # data.csv改名为vehicle_info.csv

df = pandas.read_csv("data/vehicle_info.csv")                # https://www.w3schools.com/python/data.csv

X = df[["Weight", "Volume"]]                                 # 列出独立值并调用X变量
y = df["CO2"]                                                # 将依赖值放在一个名为y的变量中

regr = linear_model.LinearRegression()                       # 创建一个线性回归对象，将独立值和相关值作为参数
regr.fit(X.values, y.values)                                 # 并用描述关系的数据填充回归对象

# 将权重从2300更改为3300，一辆配备1.3升发动机、重量为3300kg的汽车每行驶一公里将释放115克CO2
predictedCO2 = regr.predict([[3300, 1300]])
print(predictedCO2)

1-5 比例特征方法

比例特征方法
- 当数据具有不同的值，甚至不同的测量单位时，很难比较，可将数据缩放为容易比较的新值。
- 标准化方法使用的公式：z = (x - u) / s，z是新值，x是原始值，u是平均值，s是标准差。

(1) 比例特征缩放

import pandas
from sklearn.preprocessing import StandardScaler
scale = StandardScaler()                                      # data.csv改名为vehicle_info.csv

df = pandas.read_csv("data/vehicle_info.csv")                 # https://www.w3schools.com/python/data.csv
X = df[["Weight", "Volume"]]                                  # 列出独立值并调用X变量
scaledX = scale.fit_transform(X)                              # 缩放重量和体积列中的所有值

print(scaledX)

(2) 预测CO2的值

import pandas
from sklearn import linear_model
from sklearn.preprocessing import StandardScaler
# Standardize the features, fit a linear regression on the scaled values and
# predict the CO2 emission of a 2300 kg car with a 1.3 litre engine.
scale = StandardScaler()

# data.csv from https://www.w3schools.com/python/data.csv, stored locally as vehicle_info.csv
df = pandas.read_csv("data/vehicle_info.csv")

X = df[["Weight", "Volume"]]                                 # independent variables
y = df["CO2"]                                                # dependent variable

scaledX = scale.fit_transform(X.values)                      # z = (x - mean) / std, per column

regr = linear_model.LinearRegression()
regr.fit(scaledX, y.values)

# The new sample must use the same units as the training columns. The unscaled
# regression examples on this same CSV pass a 1.3 litre engine as 1300 (cm3),
# so the volume is given as 1300 here; passing 1.3 would be scaled as a
# 1.3 cm3 engine and distort the prediction.
scaled = scale.transform([[2300, 1300]])

# scaled already has shape (1, 2), which is what predict() expects.
predictedCO2 = regr.predict(scaled)
print(predictedCO2)

2 训练(测试)

训练(测试)
- 一种衡量模型准确性的方法，将数据集分成了两个组：训练集(训练模型)、测试集(测试模型)。
- 80%用于训练(训练模型意味着创建模型)，20%用于测试(测试模型意味着测试模型的准确性)。

2-1 数据集

import numpy
import matplotlib.pyplot as plt                               # 展示了一家商店中的100位顾客，以及他们的购物习惯

numpy.random.seed(2)
x = numpy.random.normal(3, 1, 100)                            # x轴表示购买前的分钟数
y = numpy.random.normal(150, 40, 100) / x                     # y轴表示购买时花费的金额

plt.scatter(x, y)
plt.show()

2-2 显示训练集

import numpy
import matplotlib.pyplot as plt

# Synthetic data set; the fixed seed makes every run produce the same 100 points.
numpy.random.seed(2)
x = numpy.random.normal(3, 1, 100)
y = numpy.random.normal(150, 40, 100) / x

# Positional 80/20 split: the first 80 samples train the model and the last
# 20 are held out for testing. Nothing is shuffled here.
train_x, train_y = x[:80], y[:80]
test_x, test_y = x[80:], y[80:]

# Plot the training set.
plt.scatter(train_x, train_y)
plt.show()

# To plot the test set instead:
# plt.scatter(test_x, test_y)
# plt.show()

2-3 拟合数据集

import numpy
import matplotlib.pyplot as plt

numpy.random.seed(2)
x = numpy.random.normal(3, 1, 100)
y = numpy.random.normal(150, 40, 100) / x

train_x = x[:80]                                             # 拆分成训练集，随机选择80%的原始数据
train_y = y[:80]

test_x = x[80:]                                              # 拆分成测试集，剩下的20%
test_y = y[80:]

model = numpy.poly1d(numpy.polyfit(train_x, train_y, 4))
lines = numpy.linspace(0, 6, 100)                            # 通过数据点画一条线

plt.scatter(train_x, train_y)                                # 显示训练集
plt.plot(lines, model(lines))                                # 这条线指顾客在店里花6分钟可能购买价值200的商品
plt.show()                                                   # 有过度拟合的迹象

2-4 R平方分数

import numpy
from sklearn.metrics import r2_score

numpy.random.seed(2)
x = numpy.random.normal(3, 1, 100)
y = numpy.random.normal(150, 40, 100) / x

train_x = x[:80]                                             # 拆分成训练集
train_y = y[:80]

test_x = x[80:]                                              # 拆分成测试集
test_y = y[80:]

model = numpy.poly1d(numpy.polyfit(train_x, train_y, 4))
r2 = r2_score(train_y, model(train_x))                       # 训练数据在多项式回归中的拟合程度
print(r2)                                                    # 结果0.799说明存在关系

2-5 引入测试集

import numpy
from sklearn.metrics import r2_score

numpy.random.seed(2)
x = numpy.random.normal(3, 1, 100)
y = numpy.random.normal(150, 40, 100) / x

train_x = x[:80]                                             # 拆分成训练集
train_y = y[:80]

test_x = x[80:]                                              # 拆分成测试集
test_y = y[80:]

model = numpy.poly1d(numpy.polyfit(train_x, train_y, 4))
r2 = r2_score(test_y, model(test_x))
print(r2)                                                    # 结果0.809说明该模型适合测试集

2-6 预测未来价值

import numpy

# Synthetic data set; the fixed seed makes every run produce the same 100 points.
numpy.random.seed(2)
x = numpy.random.normal(3, 1, 100)
y = numpy.random.normal(150, 40, 100) / x

# Positional 80/20 split: first 80 samples for training, last 20 for testing.
train_x, test_x = x[:80], x[80:]
train_y, test_y = y[:80], y[80:]

# Degree-4 polynomial fitted on the training part only.
coefficients = numpy.polyfit(train_x, train_y, 4)
model = numpy.poly1d(coefficients)

# Model output at x = 5 (about 22.88).
print(model(5))

3 常见分类算法

常见分类算法
- 监督学习算法，Supervised Algorithms。
  - 在监督学习训练过程中，可以由训练数据集学到或建立一个模式(函数/Learning model)，并依此模式推测新的实例。
  - 要求特定的输入/输出，首先需要决定使用哪种数据作为范例，包括神经网络、支持向量机、最近邻居法、决策树等。
- 无监督学习算法，Unsupervised Algorithms：没有特定的目标输出，算法将数据集分为不同的组。
- 强化学习算法，Reinforcement Algorithms。
  - 主要基于决策进行训练，根据输出结果的成功或错误训练，通过大量经验训练优化后的算法将能够给出较好的预测。
  - 在运筹学和控制论的语境下，强化学习被称作“近似动态规划”，即Approximate Dynamic Programming，简称ADP。

3-1 决策树

决策树
- 一个流程图，可根据以前的经验做出决策，决策树需满足所有数据都是数字。
- 运行决策树的次数足够多时，会看到它给出不同结果，即便提供了相同的数据。
- 决策树并未给定100%确定的答案，只是基于结果的概率，因而结果会有所不同。

import pandas

""" data/actor_info.csv
Age	Experience	Rank	Nationality	Go
36  10          9	    UK          NO
42  12          4	    USA         NO
23  4           6	    N           NO
52  4           4	    USA         NO
43  21          8	    USA         YES
44  14          5	    UK          NO
66  3           7	    N           YES
35  14          9	    UK          YES
52  13          7	    N           YES
35  5           9	    N           YES
24  3           5	    USA         NO
18  3           7	    UK          YES
45  9           9	    UK          YES
"""
df = pandas.read_csv("data/actor_info.csv")                  # 喜剧演员的信息，是否决定看节目
print(df)                                                    # 读取并打印数据集

(1) 数值转换

import pandas

df = pandas.read_csv("data/actor_info.csv")
ds = {"UK": 0, "USA": 1, "N": 2}                              # 将非数字列转换为数字列
df["Nationality"] = df["Nationality"].map(ds)
ds = {"YES": 1, "NO": 0}
df["Go"] = df["Go"].map(ds)

print(df)                                                     # 读取并打印数据集

(2) 创建决策树

import pandas
import graphviz                                              # 需要安装graphviz库和PlantUML插件
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier

df = pandas.read_csv("data/actor_info.csv")

# The classifier needs numeric input, so the two text columns are encoded as integers.
nationality_codes = {"UK": 0, "USA": 1, "N": 2}
go_codes = {"YES": 1, "NO": 0}
df["Nationality"] = df["Nationality"].map(nationality_codes)
df["Go"] = df["Go"].map(go_codes)

features = ["Age", "Experience", "Rank", "Nationality"]
X = df[features]                    # feature columns: what we predict from
y = df["Go"]                        # target column: what we try to predict

print(X)
print(y)

# Splits are chosen with the Gini impurity. For a node holding x samples of
# one class and y of the other (n = x + y):  Gini = 1 - (x/n)^2 - (y/n)^2
dtree = DecisionTreeClassifier().fit(X, y)

# Alternative rendering through matplotlib (left disabled):
# tree.plot_tree(dtree, feature_names=features)
# plt.savefig(sys.stdout.buffer)
# sys.stdout.flush()

# Export the tree as DOT source and render it with graphviz
# (these notes state that this produces decision_tree.pdf).
dot_data = tree.export_graphviz(dtree, feature_names=features)
graphviz.Source(dot_data).render("data/decision_tree")

(3) 预测未来价值

import pandas
from sklearn.tree import DecisionTreeClassifier

df = pandas.read_csv("data/actor_info.csv")

# Encode the text columns as integers so the classifier can use them.
nationality_codes = {"UK": 0, "USA": 1, "N": 2}
go_codes = {"YES": 1, "NO": 0}
df["Nationality"] = df["Nationality"].map(nationality_codes)
df["Go"] = df["Go"].map(go_codes)

features = ["Age", "Experience", "Rank", "Nationality"]
X = df[features]                    # feature columns
y = df["Go"]                        # target column

dtree = DecisionTreeClassifier().fit(X.values, y.values)

# Query row in feature order [Age, Experience, Rank, Nationality]: a
# 40-year-old comedian with 10 years of experience, comedy rank 6,
# nationality code 1 (USA). Prints [1] for "go" or [0] for "don't go".
print(dtree.predict([[40, 10, 6, 1]]))

3-2 K-均值

K-均值
- 一种用于聚类数据点的无监督学习方法，该算法通过最小化每个簇中的方差将数据点迭代划分为K个簇。
  - 首先，每个数据点被随机分配到K个集群中的任意一个，然后，计算每个集群的质心，即功能上的中心。
  - 并将每个数据点重新分配给具有最接近质心的集群，重复该过程，直到每个数据点的集群分配不再变化。
- K-均值聚类要求选择K，即要将数据分组到的聚类数。elbow(肘部)方法绘制不同K值下的惯性(基于距离的度量)。
- 并可视化惯性开始线性下降的点，该点被称为elbow(肘部)，是根据数据对K最佳值的一个很好估计。

import matplotlib.pyplot as plt

x = [4, 5, 10, 4, 3, 11, 14, 6, 10, 12]
y = [21, 19, 24, 17, 16, 25, 24, 22, 21, 21]

plt.scatter(x, y)
plt.show()                                                    # 可视化一些数据点

(1) elbow法

import warnings
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans

x = [4, 5, 10, 4, 3, 11, 14, 6, 10, 12]
y = [21, 19, 24, 17, 16, 25, 24, 22, 21, 21]

data = list(zip(x, y))              # pair the coordinates into (x, y) points

# Run K-means for every candidate K and record the resulting inertia.
# There are only 10 points, so K can be at most 10.
cluster_counts = range(1, 11)
inertias = []
with warnings.catch_warnings():
    warnings.simplefilter("ignore")     # silence warnings raised while fitting
    for k in cluster_counts:
        fitted = KMeans(n_clusters=k).fit(data)
        inertias.append(fitted.inertia_)

# Elbow plot: inertia against K. The bend (the "elbow") marks a good K;
# for this data the notes pick K = 2.
plt.plot(cluster_counts, inertias, marker="o")
plt.title("Elbow method")
plt.xlabel("Number of clusters")
plt.ylabel("Inertia")
plt.show()

(2) 重新训练

import warnings
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans

x = [4, 5, 10, 4, 3, 11, 14, 6, 10, 12]
y = [21, 19, 24, 17, 16, 25, 24, 22, 21, 21]

data = list(zip(x, y))                                       # 将数据转换为一组点

kmeans = KMeans(n_clusters=2)
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    kmeans.fit(data)

plt.scatter(x, y, c=kmeans.labels_)
plt.show()                                                   # 可视化数据点

3-3 混淆矩阵

混淆矩阵
- 一个用于分类问题的表格，用于评估模型中出现错误的位置，行代表结果应该属于的实际类别，列代表所做的预测。
- 创建的混淆矩阵有4个象限：真阴性、假阳性、假阴性、真阳性，True表准确预测值，False表存在错误或错误预测。
- 该矩阵提供了许多指标，可评估分类模型，不同的衡量标准包括：准确度、精确度、灵敏度(召回率)、特异性和F-score。
  - 准确度：衡量模型正确的频率，计算方法为(真阳性+真阴性)/总预测。
  - 精确度：在预测的积极因素中，真正积极的百分比，计算方法为真阳性/(真阳性+假阳性)。
  - 灵敏度：在所有的阳性病例中，预测阳性的百分比，计算方法为真阳性/(真阳性+假阴性)。
  - 特异性：类似于灵敏度，但从阴性结果的角度来看，计算方法为真阴性/(真阴性+假阳性)。
  - F-score：即精度和灵敏度的调和平均值，计算方法为2*((精度*灵敏度)/(精度+灵敏度))。

import numpy                                                 # 通过逻辑回归的预测来创建混淆矩阵
from sklearn import metrics
import matplotlib.pyplot as plt

actual = numpy.random.binomial(1, 0.9, size=1000)            # 生成实际值
predic = numpy.random.binomial(1, 0.9, size=1000)            # 生成预测值

confusion_matrix = metrics.confusion_matrix(actual, predic)  # 对实际值和预测值使用混淆矩阵函数
cm_display = metrics.ConfusionMatrixDisplay(                 # 将表格转换为混淆矩阵显示
    confusion_matrix=confusion_matrix,
    display_labels=[False, True]
)

cm_display.plot()                                            # 显示绘图
plt.show()

(1) 准确度

import numpy
from sklearn import metrics

actual = numpy.random.binomial(1, 0.9, size=1000)             # 生成实际值
predic = numpy.random.binomial(1, 0.9, size=1000)             # 生成预测值

Accuracy = metrics.accuracy_score(actual, predic)             # 准确度，衡量模型正确的频率
print(Accuracy)

(2) 精确度

import numpy
from sklearn import metrics

actual = numpy.random.binomial(1, 0.9, size=1000)             # 生成实际值
predic = numpy.random.binomial(1, 0.9, size=1000)             # 生成预测值

Precision = metrics.precision_score(actual, predic)           # 精确度，不评估正确预测的负面案例
print(Precision)

(3) 灵敏度

import numpy
from sklearn import metrics

actual = numpy.random.binomial(1, 0.9, size=1000)             # 生成实际值
predic = numpy.random.binomial(1, 0.9, size=1000)             # 生成预测值

Sensitivity_recall = metrics.recall_score(actual, predic)     # 灵敏度，衡量模型预测阳性的好坏程度
print(Sensitivity_recall)

(4) 特异性

import numpy
from sklearn import metrics

actual = numpy.random.binomial(1, 0.9, size=1000)             # 生成实际值
predic = numpy.random.binomial(1, 0.9, size=1000)             # 生成预测值

Specificity = metrics.recall_score(actual, predic, pos_label=0)
print(Specificity)                                            # 特异性，类似敏感性，从阴性结果角度看

(5) F-score

import numpy
from sklearn import metrics

actual = numpy.random.binomial(1, 0.9, size=1000)             # 生成实际值
predic = numpy.random.binomial(1, 0.9, size=1000)             # 生成预测值

F1_score = metrics.f1_score(actual, predic)                   # 同时考虑假阳性和假阴性
print(F1_score)                                               # 适用于不平衡的数据集，F-score

3-4 层次聚类

层次聚类
- 是一种用于聚类数据点的无监督学习方法，该算法通过测量数据之间的差异来构建集群。
- 无监督学习意味着模型不需要事先训练，也不需要目标变量(标签)，可用于任何数据，以可视化和解释各个数据点之间的关系。
  - 首先将每个数据点视为其自己的集群，然后将它们之间距离最短的集群连接在一起，以创建更大的集群。
  - 重复此步骤，直到形成一个包含所有数据点的大集群，凝聚聚类(即一种遵循自下而上方法的层次聚类)。
- 在这里将使用层次聚类对数据点进行分组，并使用树状图和散点图可视化聚类。

import matplotlib.pyplot as plt

x = [4, 5, 10, 4, 3, 11, 14 , 6, 10, 12]
y = [21, 19, 24, 17, 16, 25, 24, 22, 21, 21]

plt.scatter(x, y)                                             # 可视化一些数据点
plt.show()

(1) 树状图可视化

import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import dendrogram, linkage

x = [4, 5, 10, 4, 3, 11, 14, 6, 10, 12]
y = [21, 19, 24, 17, 16, 25, 24, 22, 21, 21]

data = list(zip(x, y))

linkage_data = linkage(data, method="ward", metric="euclidean")
dendrogram(linkage_data)                                     # 使用欧几里得距离度量，并使用树状图将其可视化

plt.show()

(2) 散点图可视化

import matplotlib.pyplot as plt
from sklearn.cluster import AgglomerativeClustering

x = [4, 5, 10, 4, 3, 11, 14, 6, 10, 12]
y = [21, 19, 24, 17, 16, 25, 24, 22, 21, 21]

data = list(zip(x, y))              # pair the coordinates into (x, y) points

# Ward linkage only supports Euclidean distance, which is also the estimator's
# default, so no distance argument is passed. The former affinity="euclidean"
# keyword was deprecated in scikit-learn 1.2 and removed in 1.4 (renamed to
# `metric`); leaving it out keeps this snippet working on old and new versions.
hierarchical_cluster = AgglomerativeClustering(n_clusters=2, linkage="ward")
labels = hierarchical_cluster.fit_predict(data)     # cluster index (0 or 1) for every point

# Colour each point by the cluster it was assigned to.
plt.scatter(x, y, c=labels)
plt.show()

3-5 逻辑回归

逻辑回归
- 旨在解决分类问题，通过预测分类结果来做到这一点，与预测连续结果的线性回归不同。
  - 最简单的情况有两个结果，称为二项式，例如：预测肿瘤是恶性肿瘤还是良性肿瘤。
  - 两个以上结果的分类情况，称为多项式，例如：预测3种不同物种之间的鸢尾花类别。

import numpy
from sklearn import linear_model                             # 将自变量存储在X中，将因变量存储在y中

X = numpy.array([3.78, 2.44, 2.09, 0.14, 1.72, 1.65, 4.92, 4.37, 4.96, 4.52, 3.69, 5.88]).reshape(-1, 1)
y = numpy.array([0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1])

logr = linear_model.LogisticRegression()                     # 创建逻辑回归对象，将独立值和相关值作为参数
logr.fit(X, y)                                               # 并用描述关系的数据填充回归对象

predicted = logr.predict(numpy.array([3.46]).reshape(-1, 1))
print(predicted)                                             # 结果为[0]，预测3.46mm的肿瘤不会癌变

(1) 系数

import numpy
from sklearn import linear_model

# Tumor sizes as a single-feature column vector, with labels 0 / 1.
sizes = [3.78, 2.44, 2.09, 0.14, 1.72, 1.65, 4.92, 4.37, 4.96, 4.52, 3.69, 5.88]
X = numpy.array(sizes).reshape(-1, 1)
y = numpy.array([0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1])

logr = linear_model.LogisticRegression().fit(X, y)

# coef_ is the change in log-odds per unit increase of X; exponentiating it
# gives the odds ratio. These notes report a value of about 4, i.e. each extra
# unit of tumor size multiplies the odds of class 1 by roughly 4.
log_odds = logr.coef_
odds = numpy.exp(log_odds)

print(odds)

(2) 概率

import numpy
from sklearn import linear_model

X = numpy.array([3.78, 2.44, 2.09, 0.14, 1.72, 1.65, 4.92, 4.37, 4.96, 4.52, 3.69, 5.88]).reshape(-1, 1)
y = numpy.array([0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1])

logr = linear_model.LogisticRegression()                     # 创建逻辑回归对象，将独立值和相关值作为参数
logr.fit(X, y)                                               # 并用描述关系的数据填充回归对象


def logit2prob(logr, X):
    """Convert a fitted logistic model's linear predictor into probabilities.

    Args:
        logr: Object exposing ``coef_`` and ``intercept_`` (for example a
            fitted ``LogisticRegression``).
        X: Feature value(s); combined with the coefficients by broadcasting.

    Returns:
        The probability ``1 / (1 + exp(-log_odds))`` for every entry, with the
        same shape as ``logr.coef_ * X + logr.intercept_``.
    """
    # Linear predictor: looks just like a linear-regression formula.
    log_odds = logr.coef_ * X + logr.intercept_
    # odds / (1 + odds) with odds = exp(log_odds) overflows to inf / inf = nan
    # for large log-odds. The same quantity is computed here as
    # exp(-log(1 + exp(-log_odds))), which stays finite and saturates at
    # 1.0 / 0.0 for very large / very small log-odds.
    probability = numpy.exp(-numpy.logaddexp(0, -log_odds))
    return probability


print(logit2prob(logr, X))                                   # 找出每个肿瘤癌变的概率，3.78cm肿瘤癌变的概率为61%

3-6 网格搜索

网格搜索
- 穷举搜索：在所有候选参数的选择中，通过循环遍历尝试每一种可能性，表现最好的参数就是最终的结果。
- 大多数机器学习模型都包含可以调整以改变模型学习方式的参数，例如：逻辑回归模型。
  - C用于控制正则化参数，影响模型的复杂性，选择最佳值C则取决于训练模型的数据。
  - 一种方法是尝试不同的值，然后选择给出最佳分数的值，这种技术称之为网格搜索。
  - 如果必须为两个及两个以上参数选择值，将评估值集的所有组合，从而形成值网格。
- 使用训练数据对逻辑回归模型进行评分
  - 若模型与数据过于接近，可能无法很好地预测未知数据，这种统计误差被称为过度拟合。
  - 为了避免被训练的数据过度拟合导致的误差，可以保留一部分数据，专门用于测试模型。

(1) 默认参数

from sklearn import datasets                                 # 将加载用于对鸢尾花进行分类的逻辑模型
from sklearn.linear_model import LogisticRegression

iris = datasets.load_iris()                                  # 加载将使用的数据集

X = iris["data"]                                             # 创建一组自变量X和一个因变量y
y = iris["target"]

logit = LogisticRegression(max_iter=10000)                   # 查看iris数据集并尝试在逻辑回归中训练不同值的模型

print(logit.fit(X, y))                                       # 使模型适合数据
print(logit.score(X, y))                                     # 运行评分方法，默认设置C为1，得分为0.973

(2) 设置范围

from sklearn import datasets                                 # 将加载用于对鸢尾花进行分类的逻辑模型
from sklearn.linear_model import LogisticRegression

iris = datasets.load_iris()         # iris flower data set

# Independent variables X and dependent variable y.
X, y = iris["data"], iris["target"]

# max_iter is raised well above the default for the solver.
logit = LogisticRegression(max_iter=10000)

# Candidate values for the regularization parameter C.
C = [0.25, 0.5, 0.75, 1, 1.25, 1.5, 1.75, 2]

# Refit the same estimator once per candidate and record its score.
# The score is computed on the data the model was trained on.
scores = [logit.set_params(C=choice).fit(X, y).score(X, y) for choice in C]

# One score per entry of C; the notes find the best value around C = 1.75.
print(scores)

3-7 分类数据

分类数据
- 当数据由字符串表示类别时，很难使用它们来训练只接受数字数据的机器学习模型。
- 此时可以转换数据，以便在模型中使用，而不是忽略分类数据并从模型中排除信息。
- 热编码：用一列代表类别中的每个组，对于每一列，值将为1或0，1表示包含组，0表示排除组。

import pandas as pd                                           # data.csv改名为vehicle_info.csv

cars = pd.read_csv("data/vehicle_info.csv")                   # 在"多元线性回归"中预测CO2的排放量
print(cars.to_string())                                       # 排除了汽车品牌和型号

(1) 热编码

import pandas as pd                                           # data.csv改名为vehicle_info.csv

cars = pd.read_csv("data/vehicle_info.csv")
ohe_cars = pd.get_dummies(cars[["Car"]])                      # 执行一次热编码

print(ohe_cars.to_string())                                   # 为Car列中的每个汽车品牌创建了一个列

(2) 预测CO2

import pandas as pd                                          # data.csv改名为vehicle_info.csv
from sklearn import linear_model                             # 创建一个线性模型

cars = pd.read_csv("data/vehicle_info.csv")
ohe_cars = pd.get_dummies(cars[["Car"]])                     # one-hot encode the brand: one dummy column per brand

# Feature order is Weight, Volume, then the brand dummy columns. The numeric
# columns were previously selected as ["Volume", "Weight"], which fed the
# 2300 kg weight below into the Volume slot and the 1300 cm3 volume into the
# Weight slot; selecting Weight first makes the prediction row line up.
X = pd.concat([cars[["Weight", "Volume"]], ohe_cars], axis=1)
y = cars["CO2"]                                              # dependent variable

regr = linear_model.LinearRegression()                       # fit a linear regression on all features
regr.fit(X.values, y.values)

# Prediction row: weight 2300 kg, volume 1300 cm3, followed by 17 one-hot
# brand flags with a single 1 selecting the brand.
predictedCO2 = regr.predict([[2300, 1300, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0]])
print(predictedCO2)                                          # CO2 predicted from weight, volume and brand

(3) Dummifying

import pandas as pd

separator = "--------------------------------"

# Two categories: one dummy column is enough once the first is dropped.
colors1 = pd.DataFrame(["blue", "pink"], columns=["color"])
print(colors1)
print(separator)

dummies = pd.get_dummies(colors1, drop_first=True)
print(dummies)
print(separator)

# Three categories: two dummy columns remain after dropping the first.
colors2 = pd.DataFrame(["blue", "pink", "grey"], columns=["color"])
print(colors2)
print(separator)

# Append the original labels as "color2" so each row's encoding can be
# compared with the category it came from.
dummies = pd.get_dummies(colors2, drop_first=True).assign(color2=colors2["color"])
print(dummies)

3-8 K-最近邻

K-最近邻
- K-Nearest Neighbor，KNN，即一种分类(Classification)算法，属于基于实例的学习，也属于懒惰学习。
- 即没有训练阶段，数据集事先已有了分类和特征值，待收到新样本后直接进行处理，与急切学习相对应。
- 思想：若一样本在特征空间中的k个最邻近的样本中的大多数属于某一个类别，则该样本也划分为该类别。

import matplotlib.pyplot as plt

x = [4, 5, 10, 4, 3, 11, 14, 8, 10, 12]                       # 两个输入特征x和y
y = [21, 19, 24, 17, 16, 25, 24, 22, 21, 21]
classes = [0, 0, 1, 0, 0, 1, 1, 0, 1, 1]                      # 一个目标类classes

plt.scatter(x, y, c=classes)
plt.show()                                                    # 可视化一些数据点

(1) K=1拟合KNN

# Three lines to make our compiler able to draw
# import sys
# import matplotlib
# matplotlib.use("Agg")
import matplotlib.pyplot as plt
from warnings import simplefilter                            # 忽视所有的警告消息
from sklearn.neighbors import KNeighborsClassifier
simplefilter(action="ignore", category=FutureWarning)

# Two input features (x, y) and one target class per point.
x = [4, 5, 10, 4, 3, 11, 14, 8, 10, 12]
y = [21, 19, 24, 17, 16, 25, 24, 22, 21, 21]
classes = [0, 0, 1, 0, 0, 1, 1, 0, 1, 1]

data = list(zip(x, y))              # pair the features into (x, y) points

# K = 1: the new point takes the class of its single nearest neighbour.
knn = KNeighborsClassifier(n_neighbors=1).fit(data, classes)

# New, unlabeled point to classify.
new_x, new_y = 8, 21
new_point = [(new_x, new_y)]

prediction = knn.predict(new_point)

# Plot all points plus the new one, coloured by class, and label the new
# point with its predicted class.
plt.scatter(x + [new_x], y + [new_y], c=classes + [prediction[0]])
plt.text(x=new_x - 1.7, y=new_y - 0.7, s=f"new point, class: {prediction[0]}")
plt.show()

# Two lines to make our compiler able to draw
# plt.savefig(sys.stdout.buffer)
# sys.stdout.flush()

(2) K=5拟合KNN

# Three lines to make our compiler able to draw
# import sys
# import matplotlib
# matplotlib.use("Agg")
import matplotlib.pyplot as plt
from warnings import simplefilter                            # 忽视所有的警告消息
from sklearn.neighbors import KNeighborsClassifier
simplefilter(action="ignore", category=FutureWarning)

x = [4, 5, 10, 4, 3, 11, 14, 8, 10, 12]                      # 两个输入特征x和y
y = [21, 19, 24, 17, 16, 25, 24, 22, 21, 21]
classes = [0, 0, 1, 0, 0, 1, 1, 0, 1, 1]                     # 一个目标类classes

data = list(zip(x, y))                                       # 将输入特征转换为一组点
knn = KNeighborsClassifier(n_neighbors=5)                    # K=5拟合KNN算法，更高的K值来改变预测

knn.fit(data, classes)

new_x = 8                                                    # 创建新的x和y特征
new_y = 21
new_point = [(new_x, new_y)]

prediction = knn.predict(new_point)                          # 调用knn.predict()分类一个新的数据点

plt.scatter(x + [new_x], y + [new_y], c=classes + [prediction[0]])
plt.text(x=new_x - 1.7, y=new_y - 0.7, s=f"new point, class: {prediction[0]}")
plt.show()                                                   # 可视化一些数据点，text()突出新点的位置

# Two lines to make our compiler able to draw
# plt.savefig(sys.stdout.buffer)
# sys.stdout.flush()

4 Bootstrap聚合

Bootstrap聚合
- 也叫Bagging算法或引导聚集算法，又称装袋算法，试图解决分类或回归的过度拟合问题。
  - 主要想法是分别训练几个不同的模型，然后让所有的模型表决测试样例的输出。
  - 是机器学习中常规策略的一个例子，被称为模型平均，该技术被称为集成方法。
- 了解装袋提高模型性能的方法，必须先评估分类器在数据集上的表现，装袋是决策树概念的延续。

4-1 分类器评估

from sklearn import datasets                                 # 通过Sklearn的葡萄酒数据集发现不同类别的葡萄酒
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

# Load the wine dataset with as_frame=True so the feature names are kept.
data = datasets.load_wine(as_frame=True)

# X holds the input features, y the target classes.
X, y = data.data, data.target

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=22
)

# Baseline classifier: a single decision tree fitted on the training split.
dtree = DecisionTreeClassifier(random_state=22)
dtree.fit(X_train, y_train)

# Predict both splits so training accuracy can be compared with the accuracy
# on the unseen test split.
train_pred = dtree.predict(X_train)
y_pred = dtree.predict(X_test)

print("Train data accuracy:", accuracy_score(y_true=y_train, y_pred=train_pred))
print("Test data accuracy:", accuracy_score(y_true=y_test, y_pred=y_pred))

4-2 装袋分类器

from sklearn import datasets
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
from sklearn.ensemble import BaggingClassifier
from sklearn.model_selection import train_test_split

data = datasets.load_wine(as_frame=True)
X, y = data.data, data.target

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=22
)

# Ensemble sizes (number of estimators) to compare.
estimator_range = [2, 4, 6, 8, 10, 12, 14, 16]

# One fitted bagging ensemble per size, kept for later inspection.
models = [
    BaggingClassifier(n_estimators=size, random_state=22).fit(X_train, y_train)
    for size in estimator_range
]

# Test-set accuracy of each ensemble, in the same order as estimator_range.
scores = [
    accuracy_score(y_true=y_test, y_pred=model.predict(X_test))
    for model in models
]

# Plot accuracy against the number of estimators, with enlarged fonts so the
# labels stay visible.
# Original author's note: accuracy rises from 82.2% to 95.5% across the sweep
# and starts to drop after 14 estimators; a different random_state gives
# different results.
plt.figure(figsize=(9, 6))
plt.plot(estimator_range, scores)
plt.xlabel("n_estimators", fontsize=18)
plt.ylabel("score", fontsize=18)
plt.tick_params(labelsize=16)
plt.show()

4-3 另一种形式

from sklearn import datasets
from sklearn.ensemble import BaggingClassifier
from sklearn.model_selection import train_test_split

data = datasets.load_wine(as_frame=True)
X, y = data.data, data.target

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=22
)

# Bagging ensemble that also records the out-of-bag (OOB) score, an estimate
# that plays a role similar to a test set.
# Original author's note: the OOB estimate may overestimate the error in
# binary classification, so use it only as a complement to other metrics.
oob_model = BaggingClassifier(
    n_estimators=12,
    oob_score=True,
    random_state=22,
)
oob_model.fit(X_train, y_train)

# The OOB score and the test-set score are computed on different samples and
# the dataset is small, so the two accuracies can differ.
print(oob_model.oob_score_)

4-4 生成决策树

# import sys
import graphviz
# import matplotlib
# matplotlib.use("Agg")
from sklearn import tree
from sklearn import datasets
# from sklearn.tree import plot_tree
from matplotlib import pyplot as plt
from sklearn.ensemble import BaggingClassifier
from sklearn.model_selection import train_test_split

# Load the wine dataset and split it into training and test sets.
data = datasets.load_wine(as_frame=True)

X = data.data
y = data.target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=22)

# Fit one bagging ensemble.  An earlier version also fitted a second,
# identically configured model (oob_model) that was never used; it only
# repeated the training work, so it has been dropped.
clf = BaggingClassifier(n_estimators=12, oob_score=True, random_state=22)
clf.fit(X_train, y_train)

# clf.estimators_ is the list of fitted base estimators, one per ensemble
# member (n_estimators=12 here).  Any of them can be visualised; take the first.
dtree = clf.estimators_[0]
print(dtree)

# Alternative rendering with matplotlib instead of graphviz (kept disabled).
# The large figure is only needed for this path, so it is created here rather
# than unconditionally:
# plt.figure(figsize=(30, 20))
# plot_tree(clf.estimators_[0], feature_names = X.columns)
# plt.savefig(sys.stdout.buffer)
# sys.stdout.flush()

# Export the tree to DOT and render it; this writes data/visual_dtree.pdf.
dot_data = tree.export_graphviz(dtree, filled=True, rounded=True, special_characters=True)
graph = graphviz.Source(dot_data)
graph.render("data/visual_dtree")

5 交叉验证(建模)

交叉验证(建模)
- 在机器学习建立模型和验证模型参数时常用的办法，一般被用于评估一个机器学习模型的表现。
- 重复使用数据，切分得到的样本，组合为不同的训练集和测试集，训练及评估模型预测的好坏。
- 优化测试集的参数可能会导致信息泄漏，导致模型在看不见的数据上表现更差，为纠正该问题，执行交叉验证。
- K-Fold
  - 首先，模型中使用的训练数据被分成了k个较小的集合，用于验证模型。
  - 然后，在其中k-1个子集(折)上训练模型，剩余的一个子集用作验证集评估模型。
- 分层K-Fold
  - 在类别不平衡的情况下，需要一种方法来解释训练集和验证集的不平衡。
  - 对目标类别进行分层，意味着这两个集合在所有类别中所占的比例相等。
- 留一法，Leave-One-Out，简称LOO。
  - 使k等于数据集中数据的个数，每次只使用一个作为测试集，剩下的全作为训练集。
  - 这种方法得出的结果，与在整个数据集上训练得到的期望值最为接近，但是计算成本也十分庞大。
- 留P法，Leave-P-Out，简称LPO。
  - 每次从样本中取出P个作为测试集，其余样本作为训练集，并遍历所有可能的P个样本组合。
  - Leave-P-Out可迅速提高模型的精确度，准确的描摹大样本数据集的特征信息。
- 蒙特卡罗交叉验证，也称为Shuffle Split交叉验证。
  - 一种非常灵活的交叉验证策略，在这种技术中，数据集被随机划分为训练集和验证集。
  - 假设100个样本，60%的样本用作训练集，20%用作测试集，剩下的20%将不被使用。

5-1 K-Fold

from sklearn import datasets
from sklearn.tree import DecisionTreeClassifier              # 尝试对不同种类的鸢尾花进行分类
from sklearn.model_selection import KFold, cross_val_score

# Load the iris data as a (features, target) pair.
X, y = datasets.load_iris(return_X_y=True)

# Model to evaluate and the 5-fold splitter that decides how it is evaluated.
clf = DecisionTreeClassifier(random_state=42)
k_folds = KFold(n_splits=5)

# One score per fold.
scores = cross_val_score(clf, X, y, cv=k_folds)

# Report the per-fold scores, their mean, and how many scores were averaged.
for label, value in (
    ("Cross Validation Scores: ", scores),
    ("Average CV Score: ", scores.mean()),
    ("Number of CV Scores used in Average: ", len(scores)),
):
    print(label, value)

5-2 分层K-Fold

from sklearn import datasets
from sklearn.tree import DecisionTreeClassifier              # 尝试对不同种类的鸢尾花进行分类
from sklearn.model_selection import StratifiedKFold, cross_val_score

# Load the iris data as a (features, target) pair.
X, y = datasets.load_iris(return_X_y=True)

clf = DecisionTreeClassifier(random_state=42)

# Same number of folds as plain K-Fold, but stratified on the target classes.
sk_folds = StratifiedKFold(n_splits=5)

scores = cross_val_score(clf, X, y, cv=sk_folds)

# Original author's note: with stratified classes the average CV score is
# higher than with basic K-Fold.
summary = [
    ("Cross Validation Scores: ", scores),
    ("Average CV Score: ", scores.mean()),
    ("Number of CV Scores used in Average: ", len(scores)),
]
for label, value in summary:
    print(label, value)

5-3 留一法(LOO)

from sklearn import datasets
from sklearn.tree import DecisionTreeClassifier              # 尝试对不同种类的鸢尾花进行分类
from sklearn.model_selection import LeaveOneOut, cross_val_score

# Load the iris data as a (features, target) pair.
X, y = datasets.load_iris(return_X_y=True)

# Model to evaluate, scored with a leave-one-out splitter.
clf = DecisionTreeClassifier(random_state=42)
loo = LeaveOneOut()

scores = cross_val_score(clf, X, y, cv=loo)

# Original author's note: the number of scores equals the number of
# observations in the dataset (150 for iris), and the average CV score is
# about 94%.
report = {
    "Cross Validation Scores: ": scores,
    "Average CV Score: ": scores.mean(),
    "Number of CV Scores used in Average: ": len(scores),
}
for label, value in report.items():
    print(label, value)

5-4 留P验证(LPO)

from sklearn import datasets
from sklearn.tree import DecisionTreeClassifier              # 尝试对不同种类的鸢尾花进行分类
from sklearn.model_selection import LeavePOut, cross_val_score

# Load the iris data as a (features, target) pair.
X, y = datasets.load_iris(return_X_y=True)

# Model to evaluate, scored with a leave-P-out splitter with p=2.
clf = DecisionTreeClassifier(random_state=42)
lpo = LeavePOut(p=2)

scores = cross_val_score(clf, X, y, cv=lpo)

# Original author's note: with this many evaluations the average CV score
# comes out roughly the same as with the other strategies.
mean_score = scores.mean()
score_count = len(scores)
print("Cross Validation Scores: ", scores)
print("Average CV Score: ", mean_score)
print("Number of CV Scores used in Average: ", score_count)

5-5 蒙特卡罗验证

from sklearn import datasets
from sklearn.tree import DecisionTreeClassifier              # 尝试对不同种类的鸢尾花进行分类
from sklearn.model_selection import ShuffleSplit, cross_val_score

# Load the iris data as a (features, target) pair.
X, y = datasets.load_iris(return_X_y=True)

# Model to evaluate with Monte Carlo (shuffle-split) cross-validation.
clf = DecisionTreeClassifier(random_state=42)

# Five random splits: 60% of the samples train, 30% test, and the remaining
# 10% are left out of each split.  random_state is pinned so the splits (and
# therefore the printed scores) are reproducible, matching the fixed seeds
# used by the other snippets.
ss = ShuffleSplit(train_size=0.6, test_size=0.3, n_splits=5, random_state=42)

scores = cross_val_score(clf, X, y, cv=ss)

print("Cross Validation Scores: ", scores)
print("Average CV Score: ", scores.mean())
print("Number of CV Scores used in Average: ", len(scores))

6 AUC-ROC曲线

AUC-ROC曲线
- 在不同阈值设置的条件下，分类问题的性能度量，ROC指概率曲线，AUC指正负类可正确分类的程度。
  - 告诉模型能够在多大程度上区分类，AUC越高，模型越能预测0为0和1为1。
  - 类比疾病诊断模型，若AUC越高，模型对有疾病和无疾病的区分就会越好。
- 术语定义：真阳性(TP)、假阳性(FP)、真阴性(TN)、假阴性(FN)。
  - 特异度(Specificity)=TN/(TN+FP)
  - 假阳率(FPR)=1-Specificity=FP/(TN+FP)
  - 真阳率(TPR)/召回率(Recall)/敏感度(Sensitivity)=TP/(TP+FN)

6-1 不平衡数据

import numpy as np
from sklearn.metrics import accuracy_score, confusion_matrix

# Imbalanced toy dataset: a fraction `ratio` of the n labels is class 1, so a
# model can reach high accuracy just by always predicting the majority class.
n = 10000
ratio = .95
# Derive the class sizes so they always add up to n.  Truncating
# int((1 - ratio) * n) can drop a sample to float error (ratio = .9 yields
# 999 instead of 1000), which would leave y shorter than the n predictions.
n_1 = round(ratio * n)
n_0 = n - n_1

y = np.array([0] * n_0 + [1] * n_1)
# The "model" outputs probability 1 for every sample, i.e. always class 1.
y_proba = np.array([1]*n)
y_pred = y_proba > .5

print(f"accuracy score: {accuracy_score(y, y_pred)}")
cf_mat = confusion_matrix(y, y_pred)
print("Confusion matrix")
print(cf_mat)
# Per-class accuracy: diagonal entry of the confusion matrix over class size.
print(f"class 0 accuracy: {cf_mat[0][0]/n_0}")
print(f"class 1 accuracy: {cf_mat[1][1]/n_1}")

6-2 准确性评估

import numpy as np
from sklearn.metrics import accuracy_score, confusion_matrix

# Imbalanced toy dataset: a fraction `ratio` of the n labels is class 1, so a
# model can reach high accuracy just by always predicting the majority class.
n = 10000
ratio = .95
# Derive the class sizes so they always add up to n.  Truncating
# int((1 - ratio) * n) can drop a sample to float error (ratio = .9 yields
# 999 instead of 1000), which would leave y shorter than the n predictions.
n_1 = round(ratio * n)
n_0 = n - n_1

y = np.array([0] * n_0 + [1] * n_1)
# Model 1 outputs probability 1 for every sample, i.e. always class 1.
y_proba_1 = np.array([1]*n)
y_pred_1 = y_proba_1 > .5

# Model 1: high overall accuracy, but it says nothing about the data --
# class 1 is predicted correctly 100% of the time and class 0 0% of the time.
# Giving up some accuracy for a model that can separate the two classes to
# some degree may be the better trade.
print(f"accuracy score: {accuracy_score(y, y_pred_1)}")
cf_mat = confusion_matrix(y, y_pred_1)
print("Confusion matrix")
print(cf_mat)
print(f"class 0 accuracy: {cf_mat[0][0]/n_0}")
print(f"class 1 accuracy: {cf_mat[1][1]/n_1}")
print("----------------------")

# Probabilities from a hypothetical model that doesn't always predict the
# mode: class-0 samples score in [0, .7), class-1 samples in [.3, 1).
y_proba_2 = np.array(np.random.uniform(0, .7, n_0).tolist() + np.random.uniform(.3, 1, n_1).tolist())
y_pred_2 = y_proba_2 > .5

# Model 2: original author's note -- its accuracy score is lower than model
# 1's, but the per-class accuracy is more balanced.  Accuracy alone would rank
# model 1 higher, so another metric such as AUC is the better choice here.
print(f"accuracy score: {accuracy_score(y, y_pred_2)}")
cf_mat = confusion_matrix(y, y_pred_2)
print("Confusion matrix")
print(cf_mat)
print(f"class 0 accuracy: {cf_mat[0][0]/n_0}")
print(f"class 1 accuracy: {cf_mat[1][1]/n_1}")

(1) 第一个模型

# Three lines to make our compiler able to draw
# import sys
import numpy as np
# import matplotlib
# matplotlib.use("Agg")
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score, roc_curve

# Imbalanced toy dataset: a fraction `ratio` of the n labels is class 1.
n = 10000
ratio = .95
# Derive the class sizes so they always add up to n.  Truncating
# int((1 - ratio) * n) can drop a sample to float error (ratio = .9 yields
# 999 instead of 1000), which would leave y shorter than the n scores.
n_1 = round(ratio * n)
n_0 = n - n_1

y = np.array([0] * n_0 + [1] * n_1)
# Model 1 outputs probability 1 for every sample, i.e. always class 1.
y_proba_1 = np.array([1] * n)
y_pred_1 = y_proba_1 > .5


def plot_roc_curve(true_y, y_prob):
    """Plot the ROC curve for the given labels and predicted probabilities.

    Args:
        true_y: true binary class labels.
        y_prob: predicted probabilities/scores, one per label.
    """
    # roc_curve also returns the thresholds, which the plot does not need.
    fpr, tpr, _ = roc_curve(true_y, y_prob)
    plt.plot(fpr, tpr)
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.show()


# Original author's note: the curve looks like a line with slope 1 and the
# AUC score is about 0.5, meaning the model cannot tell the two classes apart.
plot_roc_curve(y, y_proba_1)
print(f"model 1 AUC score: {roc_auc_score(y, y_proba_1)}")

# Two lines to make our compiler able to draw
# plt.savefig(sys.stdout.buffer)
# sys.stdout.flush()

(2) 第二个模型

# Three lines to make our compiler able to draw
# import sys
import numpy as np
# import matplotlib
# matplotlib.use("Agg")
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score, roc_curve

# Imbalanced toy dataset: a fraction `ratio` of the n labels is class 1.
n = 10000
ratio = .95
# Derive the class sizes so they always add up to n.  Truncating
# int((1 - ratio) * n) can drop a sample to float error (ratio = .9 yields
# 999 instead of 1000), which would leave y shorter than the n scores.
n_1 = round(ratio * n)
n_0 = n - n_1

y = np.array([0] * n_0 + [1] * n_1)

# Probabilities from a hypothetical model that doesn't always predict the
# mode: class-0 samples score in [0, .7), class-1 samples in [.3, 1).
y_proba_2 = np.array(np.random.uniform(0, .7, n_0).tolist() + np.random.uniform(.3, 1, n_1).tolist())
y_pred_2 = y_proba_2 > .5


def plot_roc_curve(true_y, y_prob):
    """Plot the ROC curve for the given labels and predicted probabilities.

    Args:
        true_y: true binary class labels.
        y_prob: predicted probabilities/scores, one per label.
    """
    # roc_curve also returns the thresholds, which the plot does not need.
    fpr, tpr, _ = roc_curve(true_y, y_prob)
    plt.plot(fpr, tpr)
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.show()


# Original author's note: the curve sits closer to the top-left corner and the
# AUC score is close to 1, meaning the model is able to separate the classes.
plot_roc_curve(y, y_proba_2)
print(f"model 2 AUC score: {roc_auc_score(y, y_proba_2)}")

# Two lines to make our compiler able to draw
# plt.savefig(sys.stdout.buffer)
# sys.stdout.flush()

6-3 类别预测概率

import numpy as np
from sklearn.metrics import accuracy_score, roc_auc_score

# Balanced labels: n samples of class 0 followed by n samples of class 1.
n = 10000
y = np.array([0] * n + [1] * n)
half = n // 2

# Two sets of probabilities from hypothetical models.
# Model 1 stays close to .5 for both classes (values within [.25, .75)).
y_prob_1 = np.concatenate([
    np.random.uniform(.25, .5, half),
    np.random.uniform(.3, .7, n),
    np.random.uniform(.5, .75, half),
])
# Model 2 reaches toward the extremes 0 and 1 for both classes.
y_prob_2 = np.concatenate([
    np.random.uniform(0, .4, half),
    np.random.uniform(.3, .7, n),
    np.random.uniform(.6, 1, half),
])

# Accuracy at a .5 decision threshold.
y_pred_1 = y_prob_1 > .5
y_pred_2 = y_prob_2 > .5
print(f"model 1 accuracy score: {accuracy_score(y, y_pred_1)}")
print(f"model 2 accuracy score: {accuracy_score(y, y_pred_2)}")

# AUC is computed from the raw probabilities rather than thresholded labels.
print(f"model 1 AUC score: {roc_auc_score(y, y_prob_1)}")
print(f"model 2 AUC score: {roc_auc_score(y, y_prob_2)}")

(1) 第一个模型

# import sys
import numpy as np
# import matplotlib
# matplotlib.use("Agg")
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve

# Balanced labels: n samples of class 0 followed by n samples of class 1.
n = 10000
y = np.array([0] * n + [1] * n)
half = n // 2

# Two sets of probabilities from hypothetical models.
# Model 1 stays close to .5 for both classes (values within [.25, .75)).
y_prob_1 = np.concatenate([
    np.random.uniform(.25, .5, half),
    np.random.uniform(.3, .7, n),
    np.random.uniform(.5, .75, half),
])
# Model 2 reaches toward the extremes 0 and 1; it is generated here as well
# but only model 1 is plotted by this snippet.
y_prob_2 = np.concatenate([
    np.random.uniform(0, .4, half),
    np.random.uniform(.3, .7, n),
    np.random.uniform(.6, 1, half),
])


def plot_roc_curve(true_y, y_prob):
    """Plot the ROC curve for the given labels and predicted probabilities.

    Args:
        true_y: true binary class labels.
        y_prob: predicted probabilities/scores, one per label.
    """
    curve = roc_curve(true_y, y_prob)
    false_pos_rate, true_pos_rate = curve[0], curve[1]
    plt.plot(false_pos_rate, true_pos_rate)
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.show()


plot_roc_curve(y, y_prob_1)

# Two lines to make our compiler able to draw
# plt.savefig(sys.stdout.buffer)
# sys.stdout.flush()

(2) 第二个模型

# import sys
import numpy as np
# import matplotlib
# matplotlib.use("Agg")
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve

# Balanced labels: n samples of class 0 followed by n samples of class 1.
n = 10000
y = np.array([0] * n + [1] * n)

# Two sets of probabilities from hypothetical models.
# Model 1 stays close to .5 for both classes; it is generated here as well
# but only model 2 is plotted by this snippet.
y_prob_1 = np.array(
    np.random.uniform(.25, .5, n//2).tolist()
    + np.random.uniform(.3, .7, n).tolist()
    + np.random.uniform(.5, .75, n//2).tolist()
)
# Model 2 reaches toward the extremes 0 and 1 for both classes.
y_prob_2 = np.array(
    np.random.uniform(0, .4, n // 2).tolist() +
    np.random.uniform(.3, .7, n).tolist() +
    np.random.uniform(.6, 1, n // 2).tolist()
)


def plot_roc_curve(true_y, y_prob):
    """Plot the ROC curve for the given labels and predicted probabilities.

    Args:
        true_y: true binary class labels.
        y_prob: predicted probabilities/scores, one per label.
    """
    # roc_curve also returns the thresholds, which the plot does not need.
    fpr, tpr, _ = roc_curve(true_y, y_prob)
    plt.plot(fpr, tpr)
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.show()


# Use the helper instead of repeating its body inline; previously the helper
# was defined with its labels commented out and never called.
# Original author's note: although both models have similar accuracy, the one
# with the higher AUC is more reliable because AUC takes the predicted
# probabilities into account, so it is more likely to stay accurate on
# future data.
plot_roc_curve(y, y_prob_2)

# Two lines to make our compiler able to draw
# plt.savefig(sys.stdout.buffer)
# sys.stdout.flush()

Python

#机器学习 #人工智能 #模型训练

Python 机器学习

https://stitch-top.github.io/2021/09/04/python/python10-python-ji-qi-xue-xi/

作者

Dr.626

发布于

2021年9月4日 22:56:36

许可协议

Python MySQL 上一篇

Python Matplotlib 下一篇