Commit c217bd56 by Leo

upload code

parent f27d42c3
-- "a/\347\233\264\346\222\255/1-\347\254\254\344\270\200\351\230\266\346\256\265/1.4-AI+\347\247\221\347\240\224\345\210\233\346\226\260-\347\224\237\345\214\226\347\216\257\346\235\220\345\255\246\347\247\221\347\232\204\346\225\260\346\215\256\351\233\206\345\244\204\347\220\206/.gitkeep" ++ /dev/null
{ ++ /dev/null
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"outputs": [],
"source": [
"import pandas as pd"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "code",
"execution_count": 2,
"outputs": [
{
"data": {
"text/plain": " DOI 分离物质名称 水 甲醇 乙腈 pH \\\n0 3种新型α_1-受体阻断剂...性流动相HPLC分离与制备_牛长群 S-Alfuzosin 92.5 0 7.5 5.6 \n1 3种新型α_2-受体阻断剂...性流动相HPLC分离与制备_牛长群 R-Alfuzosin 92.5 0 7.5 5.6 \n2 3种新型α_3-受体阻断剂...性流动相HPLC分离与制备_牛长群 S-Terazosin 97.0 0 3.0 6.0 \n3 3种新型α_4-受体阻断剂...性流动相HPLC分离与制备_牛长群 R-Terazosin 97.0 0 3.0 6.0 \n4 3种新型α_5-受体阻断剂...性流动相HPLC分离与制备_牛长群 S-Doxazosin 80.0 0 20.0 5.8 \n.. ... ... ... .. ... ... \n253 NaN S-Venlafaxine 85.0 15 0.0 5.0 \n254 NaN R-Metoprolol 85.0 15 0.0 5.0 \n255 NaN S-Metoprolol 85.0 15 0.0 5.0 \n256 NaN R-Venlafaxine 85.0 15 0.0 5.0 \n257 NaN S-Venlafaxine 85.0 15 0.0 5.0 \n\n 流速 柱温 手性添加剂 添加剂用量 色谱柱 柱长 保留时间 \n0 1.0 NaN CM-B-CD 19.5 mmol/L C4 NaN 42.80 \n1 1.0 NaN CM-B-CD 19.5 mmol/L C4 NaN 47.40 \n2 1.0 NaN CM-B-CD 32.4 mmol/L C4 NaN 90.00 \n3 1.0 NaN CM-B-CD 32.4 mmol/L C4 NaN 97.60 \n4 1.0 NaN CM-B-CD 13 mmol/L C4 NaN 47.10 \n.. ... ... ... ... ... ... ... \n253 0.5 30 CM-B-CD 20 mmol/L C18 150×4.6 32.39 \n254 0.5 30 CM-B-CD 40 mmol/L C18 150×4.6 13.65 \n255 0.5 30 CM-B-CD 40 mmol/L C18 150×4.6 14.61 \n256 0.5 30 CM-B-CD 40 mmol/L C18 150×4.6 26.33 \n257 0.5 30 CM-B-CD 40 mmol/L C18 150×4.6 29.22 \n\n[258 rows x 13 columns]",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>DOI</th>\n <th>分离物质名称</th>\n <th>水</th>\n <th>甲醇</th>\n <th>乙腈</th>\n <th>pH</th>\n <th>流速</th>\n <th>柱温</th>\n <th>手性添加剂</th>\n <th>添加剂用量</th>\n <th>色谱柱</th>\n <th>柱长</th>\n <th>保留时间</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>3种新型α_1-受体阻断剂...性流动相HPLC分离与制备_牛长群</td>\n <td>S-Alfuzosin</td>\n <td>92.5</td>\n <td>0</td>\n <td>7.5</td>\n <td>5.6</td>\n <td>1.0</td>\n <td>NaN</td>\n <td>CM-B-CD</td>\n <td>19.5 mmol/L</td>\n <td>C4</td>\n <td>NaN</td>\n <td>42.80</td>\n </tr>\n <tr>\n <th>1</th>\n <td>3种新型α_2-受体阻断剂...性流动相HPLC分离与制备_牛长群</td>\n <td>R-Alfuzosin</td>\n <td>92.5</td>\n <td>0</td>\n <td>7.5</td>\n <td>5.6</td>\n <td>1.0</td>\n <td>NaN</td>\n <td>CM-B-CD</td>\n <td>19.5 mmol/L</td>\n <td>C4</td>\n <td>NaN</td>\n <td>47.40</td>\n </tr>\n <tr>\n <th>2</th>\n <td>3种新型α_3-受体阻断剂...性流动相HPLC分离与制备_牛长群</td>\n <td>S-Terazosin</td>\n <td>97.0</td>\n <td>0</td>\n <td>3.0</td>\n <td>6.0</td>\n <td>1.0</td>\n <td>NaN</td>\n <td>CM-B-CD</td>\n <td>32.4 mmol/L</td>\n <td>C4</td>\n <td>NaN</td>\n <td>90.00</td>\n </tr>\n <tr>\n <th>3</th>\n <td>3种新型α_4-受体阻断剂...性流动相HPLC分离与制备_牛长群</td>\n <td>R-Terazosin</td>\n <td>97.0</td>\n <td>0</td>\n <td>3.0</td>\n <td>6.0</td>\n <td>1.0</td>\n <td>NaN</td>\n <td>CM-B-CD</td>\n <td>32.4 mmol/L</td>\n <td>C4</td>\n <td>NaN</td>\n <td>97.60</td>\n </tr>\n <tr>\n <th>4</th>\n <td>3种新型α_5-受体阻断剂...性流动相HPLC分离与制备_牛长群</td>\n <td>S-Doxazosin</td>\n <td>80.0</td>\n <td>0</td>\n <td>20.0</td>\n <td>5.8</td>\n <td>1.0</td>\n <td>NaN</td>\n <td>CM-B-CD</td>\n <td>13 mmol/L</td>\n <td>C4</td>\n <td>NaN</td>\n <td>47.10</td>\n </tr>\n <tr>\n <th>...</th>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n 
<td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n </tr>\n <tr>\n <th>253</th>\n <td>NaN</td>\n <td>S-Venlafaxine</td>\n <td>85.0</td>\n <td>15</td>\n <td>0.0</td>\n <td>5.0</td>\n <td>0.5</td>\n <td>30</td>\n <td>CM-B-CD</td>\n <td>20 mmol/L</td>\n <td>C18</td>\n <td>150×4.6</td>\n <td>32.39</td>\n </tr>\n <tr>\n <th>254</th>\n <td>NaN</td>\n <td>R-Metoprolol</td>\n <td>85.0</td>\n <td>15</td>\n <td>0.0</td>\n <td>5.0</td>\n <td>0.5</td>\n <td>30</td>\n <td>CM-B-CD</td>\n <td>40 mmol/L</td>\n <td>C18</td>\n <td>150×4.6</td>\n <td>13.65</td>\n </tr>\n <tr>\n <th>255</th>\n <td>NaN</td>\n <td>S-Metoprolol</td>\n <td>85.0</td>\n <td>15</td>\n <td>0.0</td>\n <td>5.0</td>\n <td>0.5</td>\n <td>30</td>\n <td>CM-B-CD</td>\n <td>40 mmol/L</td>\n <td>C18</td>\n <td>150×4.6</td>\n <td>14.61</td>\n </tr>\n <tr>\n <th>256</th>\n <td>NaN</td>\n <td>R-Venlafaxine</td>\n <td>85.0</td>\n <td>15</td>\n <td>0.0</td>\n <td>5.0</td>\n <td>0.5</td>\n <td>30</td>\n <td>CM-B-CD</td>\n <td>40 mmol/L</td>\n <td>C18</td>\n <td>150×4.6</td>\n <td>26.33</td>\n </tr>\n <tr>\n <th>257</th>\n <td>NaN</td>\n <td>S-Venlafaxine</td>\n <td>85.0</td>\n <td>15</td>\n <td>0.0</td>\n <td>5.0</td>\n <td>0.5</td>\n <td>30</td>\n <td>CM-B-CD</td>\n <td>40 mmol/L</td>\n <td>C18</td>\n <td>150×4.6</td>\n <td>29.22</td>\n </tr>\n </tbody>\n</table>\n<p>258 rows × 13 columns</p>\n</div>"
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Load the raw records from the Excel workbook located next to this notebook.\n",
"data = pd.read_excel(io=r'dataset.xlsx')\n",
"# Bare last expression: renders a preview of the loaded frame.\n",
"data"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "code",
"execution_count": 10,
"outputs": [
{
"data": {
"text/plain": " 序号 分离物质名称 SMILES 水 甲醇 乙腈 pH 流速 柱温 手性添加剂 添加剂用量 \\\n0 1.0 S-Alfuzosin NaN 92.5 0.0 7.5 5.6 1 25 CM-β-CD NaN \n1 2.0 R-Alfuzosin NaN 92.5 0.0 7.5 5.6 1 25 CM-β-CD NaN \n2 3.0 S-Terazosin NaN 97.0 0.0 3.0 6.0 1 25 CM-β-CD NaN \n3 4.0 R-Terazosin NaN 97.0 0.0 3.0 6.0 1 25 CM-β-CD NaN \n4 5.0 S-Doxazosin NaN 80.0 0.0 20.0 5.8 1 25 CM-β-CD NaN \n.. ... ... ... ... ... ... ... ... .. ... ... \n712 NaN S-文拉法辛 NaN 85.0 15.0 0.0 5.0 0.5 30 CM-β-CD 20 \n713 NaN R-美托洛尔 NaN 85.0 15.0 0.0 5.0 0.5 30 CM-β-CD 40 \n714 NaN S-美托洛尔 NaN 85.0 15.0 0.0 5.0 0.5 30 CM-β-CD 40 \n715 NaN R-文拉法辛 NaN 85.0 15.0 0.0 5.0 0.5 30 CM-β-CD 40 \n716 NaN S-文拉法辛 NaN 85.0 15.0 0.0 5.0 0.5 30 CM-β-CD 40 \n\n 色谱柱 柱长 保留时间 分离因子 备注 column_y column_x \n0 C4 250×4.6 42.8 1.1 NaN 250.0 4.6 \n1 C4 250×4.6 47.4 NaN NaN 250.0 4.6 \n2 C4 250×4.6 90 1.1 NaN 250.0 4.6 \n3 C4 250×4.6 97.6 NaN NaN 250.0 4.6 \n4 C4 250×4.6 47.1 1.2 NaN 250.0 4.6 \n.. ... ... ... ... ... ... ... \n712 C18 150×4.6 32.39 NaN NaN 150.0 4.6 \n713 C18 150×4.6 13.65 NaN NaN 150.0 4.6 \n714 C18 150×4.6 14.61 NaN NaN 150.0 4.6 \n715 C18 150×4.6 26.33 NaN NaN 150.0 4.6 \n716 C18 150×4.6 29.22 NaN NaN 150.0 4.6 \n\n[717 rows x 18 columns]",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>序号</th>\n <th>分离物质名称</th>\n <th>SMILES</th>\n <th>水</th>\n <th>甲醇</th>\n <th>乙腈</th>\n <th>pH</th>\n <th>流速</th>\n <th>柱温</th>\n <th>手性添加剂</th>\n <th>添加剂用量</th>\n <th>色谱柱</th>\n <th>柱长</th>\n <th>保留时间</th>\n <th>分离因子</th>\n <th>备注</th>\n <th>column_y</th>\n <th>column_x</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>1.0</td>\n <td>S-Alfuzosin</td>\n <td>NaN</td>\n <td>92.5</td>\n <td>0.0</td>\n <td>7.5</td>\n <td>5.6</td>\n <td>1</td>\n <td>25</td>\n <td>CM-β-CD</td>\n <td>NaN</td>\n <td>C4</td>\n <td>250×4.6</td>\n <td>42.8</td>\n <td>1.1</td>\n <td>NaN</td>\n <td>250.0</td>\n <td>4.6</td>\n </tr>\n <tr>\n <th>1</th>\n <td>2.0</td>\n <td>R-Alfuzosin</td>\n <td>NaN</td>\n <td>92.5</td>\n <td>0.0</td>\n <td>7.5</td>\n <td>5.6</td>\n <td>1</td>\n <td>25</td>\n <td>CM-β-CD</td>\n <td>NaN</td>\n <td>C4</td>\n <td>250×4.6</td>\n <td>47.4</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>250.0</td>\n <td>4.6</td>\n </tr>\n <tr>\n <th>2</th>\n <td>3.0</td>\n <td>S-Terazosin</td>\n <td>NaN</td>\n <td>97.0</td>\n <td>0.0</td>\n <td>3.0</td>\n <td>6.0</td>\n <td>1</td>\n <td>25</td>\n <td>CM-β-CD</td>\n <td>NaN</td>\n <td>C4</td>\n <td>250×4.6</td>\n <td>90</td>\n <td>1.1</td>\n <td>NaN</td>\n <td>250.0</td>\n <td>4.6</td>\n </tr>\n <tr>\n <th>3</th>\n <td>4.0</td>\n <td>R-Terazosin</td>\n <td>NaN</td>\n <td>97.0</td>\n <td>0.0</td>\n <td>3.0</td>\n <td>6.0</td>\n <td>1</td>\n <td>25</td>\n <td>CM-β-CD</td>\n <td>NaN</td>\n <td>C4</td>\n <td>250×4.6</td>\n <td>97.6</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>250.0</td>\n <td>4.6</td>\n </tr>\n <tr>\n <th>4</th>\n <td>5.0</td>\n <td>S-Doxazosin</td>\n <td>NaN</td>\n <td>80.0</td>\n <td>0.0</td>\n 
<td>20.0</td>\n <td>5.8</td>\n <td>1</td>\n <td>25</td>\n <td>CM-β-CD</td>\n <td>NaN</td>\n <td>C4</td>\n <td>250×4.6</td>\n <td>47.1</td>\n <td>1.2</td>\n <td>NaN</td>\n <td>250.0</td>\n <td>4.6</td>\n </tr>\n <tr>\n <th>...</th>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n </tr>\n <tr>\n <th>712</th>\n <td>NaN</td>\n <td>S-文拉法辛</td>\n <td>NaN</td>\n <td>85.0</td>\n <td>15.0</td>\n <td>0.0</td>\n <td>5.0</td>\n <td>0.5</td>\n <td>30</td>\n <td>CM-β-CD</td>\n <td>20</td>\n <td>C18</td>\n <td>150×4.6</td>\n <td>32.39</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>150.0</td>\n <td>4.6</td>\n </tr>\n <tr>\n <th>713</th>\n <td>NaN</td>\n <td>R-美托洛尔</td>\n <td>NaN</td>\n <td>85.0</td>\n <td>15.0</td>\n <td>0.0</td>\n <td>5.0</td>\n <td>0.5</td>\n <td>30</td>\n <td>CM-β-CD</td>\n <td>40</td>\n <td>C18</td>\n <td>150×4.6</td>\n <td>13.65</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>150.0</td>\n <td>4.6</td>\n </tr>\n <tr>\n <th>714</th>\n <td>NaN</td>\n <td>S-美托洛尔</td>\n <td>NaN</td>\n <td>85.0</td>\n <td>15.0</td>\n <td>0.0</td>\n <td>5.0</td>\n <td>0.5</td>\n <td>30</td>\n <td>CM-β-CD</td>\n <td>40</td>\n <td>C18</td>\n <td>150×4.6</td>\n <td>14.61</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>150.0</td>\n <td>4.6</td>\n </tr>\n <tr>\n <th>715</th>\n <td>NaN</td>\n <td>R-文拉法辛</td>\n <td>NaN</td>\n <td>85.0</td>\n <td>15.0</td>\n <td>0.0</td>\n <td>5.0</td>\n <td>0.5</td>\n <td>30</td>\n <td>CM-β-CD</td>\n <td>40</td>\n <td>C18</td>\n <td>150×4.6</td>\n <td>26.33</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>150.0</td>\n <td>4.6</td>\n </tr>\n <tr>\n <th>716</th>\n <td>NaN</td>\n <td>S-文拉法辛</td>\n <td>NaN</td>\n <td>85.0</td>\n <td>15.0</td>\n <td>0.0</td>\n <td>5.0</td>\n <td>0.5</td>\n <td>30</td>\n <td>CM-β-CD</td>\n <td>40</td>\n <td>C18</td>\n <td>150×4.6</td>\n 
<td>29.22</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>150.0</td>\n <td>4.6</td>\n </tr>\n </tbody>\n</table>\n<p>717 rows × 18 columns</p>\n</div>"
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Clean the raw table: fill missing experimental conditions with defaults, drop the\n",
"# literature-reference column, split the column size into two numeric features and\n",
"# remove the unit from the additive amount.\n",
"\n",
"# Value used when a record does not report the condition.\n",
"# NOTE(review): these defaults are assumptions baked into the notebook - confirm them.\n",
"default_values = {\n",
"    \"pH\": 7,\n",
"    \"柱温\": 25,\n",
"    \"柱长\": \"250×4.6\",\n",
"    \"流速\": 1,\n",
"    \"色谱柱\": \"C18\",\n",
"}\n",
"for column, default in default_values.items():\n",
"    data.loc[:, column] = data[column].fillna(default)\n",
"\n",
"# errors=\"ignore\" keeps the cell re-runnable: after the first run DOI is already gone\n",
"# and a plain drop would raise KeyError.\n",
"data = data.drop(columns=[\"DOI\"], errors=\"ignore\")\n",
"\n",
"# \"250×4.6\" -> column_y = 250.0 (number before ×), column_x = 4.6 (number after ×).\n",
"# Presumably length × inner diameter of the column - verify the units with the data owner.\n",
"data[['column_y','column_x']] = data['柱长'].str.split('×',expand = True).astype(float)\n",
"\n",
"# Room temperature is recorded as the text \"室温\"; map it to 25.\n",
"data = data.replace({\"柱温\":{\"室温\":25}})\n",
"\n",
"# Remove the literal \"mmol/L\" unit (and the whitespace around it) from the amount.\n",
"# str.rstrip(\"mmol/L\") is not used because it strips a *set of characters*, which leaves\n",
"# the separating space behind, and the .str accessor would turn numeric cells into NaN.\n",
"# Missing amounts stay missing; the values remain strings, as before.\n",
"additive_amount = data[\"添加剂用量\"]\n",
"data[\"添加剂用量\"] = additive_amount.where(\n",
"    additive_amount.isna(),\n",
"    additive_amount.astype(str).str.replace(r\"\\s*mmol/L\\s*$\", \"\", regex=True),\n",
")\n",
"data"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "code",
"execution_count": 11,
"outputs": [
{
"data": {
"text/plain": " 序号 分离物质名称 SMILES 水 甲醇 乙腈 pH 流速 柱温 手性添加剂 添加剂用量 \\\n0 1.0 S-Alfuzosin NaN 92.5 0.0 7.5 5.6 1 25 CM-β-CD NaN \n1 2.0 R-Alfuzosin NaN 92.5 0.0 7.5 5.6 1 25 CM-β-CD NaN \n2 3.0 S-Terazosin NaN 97.0 0.0 3.0 6.0 1 25 CM-β-CD NaN \n3 4.0 R-Terazosin NaN 97.0 0.0 3.0 6.0 1 25 CM-β-CD NaN \n4 5.0 S-Doxazosin NaN 80.0 0.0 20.0 5.8 1 25 CM-β-CD NaN \n.. ... ... ... ... ... ... ... ... .. ... ... \n712 NaN S-文拉法辛 NaN 85.0 15.0 0.0 5.0 0.5 30 CM-β-CD 20 \n713 NaN R-美托洛尔 NaN 85.0 15.0 0.0 5.0 0.5 30 CM-β-CD 40 \n714 NaN S-美托洛尔 NaN 85.0 15.0 0.0 5.0 0.5 30 CM-β-CD 40 \n715 NaN R-文拉法辛 NaN 85.0 15.0 0.0 5.0 0.5 30 CM-β-CD 40 \n716 NaN S-文拉法辛 NaN 85.0 15.0 0.0 5.0 0.5 30 CM-β-CD 40 \n\n 色谱柱 柱长 保留时间 分离因子 备注 column_y column_x \n0 C4 250×4.6 42.8 1.1 NaN 250.0 4.6 \n1 C4 250×4.6 47.4 NaN NaN 250.0 4.6 \n2 C4 250×4.6 90 1.1 NaN 250.0 4.6 \n3 C4 250×4.6 97.6 NaN NaN 250.0 4.6 \n4 C4 250×4.6 47.1 1.2 NaN 250.0 4.6 \n.. ... ... ... ... ... ... ... \n712 C18 150×4.6 32.39 NaN NaN 150.0 4.6 \n713 C18 150×4.6 13.65 NaN NaN 150.0 4.6 \n714 C18 150×4.6 14.61 NaN NaN 150.0 4.6 \n715 C18 150×4.6 26.33 NaN NaN 150.0 4.6 \n716 C18 150×4.6 29.22 NaN NaN 150.0 4.6 \n\n[666 rows x 18 columns]",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>序号</th>\n <th>分离物质名称</th>\n <th>SMILES</th>\n <th>水</th>\n <th>甲醇</th>\n <th>乙腈</th>\n <th>pH</th>\n <th>流速</th>\n <th>柱温</th>\n <th>手性添加剂</th>\n <th>添加剂用量</th>\n <th>色谱柱</th>\n <th>柱长</th>\n <th>保留时间</th>\n <th>分离因子</th>\n <th>备注</th>\n <th>column_y</th>\n <th>column_x</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>1.0</td>\n <td>S-Alfuzosin</td>\n <td>NaN</td>\n <td>92.5</td>\n <td>0.0</td>\n <td>7.5</td>\n <td>5.6</td>\n <td>1</td>\n <td>25</td>\n <td>CM-β-CD</td>\n <td>NaN</td>\n <td>C4</td>\n <td>250×4.6</td>\n <td>42.8</td>\n <td>1.1</td>\n <td>NaN</td>\n <td>250.0</td>\n <td>4.6</td>\n </tr>\n <tr>\n <th>1</th>\n <td>2.0</td>\n <td>R-Alfuzosin</td>\n <td>NaN</td>\n <td>92.5</td>\n <td>0.0</td>\n <td>7.5</td>\n <td>5.6</td>\n <td>1</td>\n <td>25</td>\n <td>CM-β-CD</td>\n <td>NaN</td>\n <td>C4</td>\n <td>250×4.6</td>\n <td>47.4</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>250.0</td>\n <td>4.6</td>\n </tr>\n <tr>\n <th>2</th>\n <td>3.0</td>\n <td>S-Terazosin</td>\n <td>NaN</td>\n <td>97.0</td>\n <td>0.0</td>\n <td>3.0</td>\n <td>6.0</td>\n <td>1</td>\n <td>25</td>\n <td>CM-β-CD</td>\n <td>NaN</td>\n <td>C4</td>\n <td>250×4.6</td>\n <td>90</td>\n <td>1.1</td>\n <td>NaN</td>\n <td>250.0</td>\n <td>4.6</td>\n </tr>\n <tr>\n <th>3</th>\n <td>4.0</td>\n <td>R-Terazosin</td>\n <td>NaN</td>\n <td>97.0</td>\n <td>0.0</td>\n <td>3.0</td>\n <td>6.0</td>\n <td>1</td>\n <td>25</td>\n <td>CM-β-CD</td>\n <td>NaN</td>\n <td>C4</td>\n <td>250×4.6</td>\n <td>97.6</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>250.0</td>\n <td>4.6</td>\n </tr>\n <tr>\n <th>4</th>\n <td>5.0</td>\n <td>S-Doxazosin</td>\n <td>NaN</td>\n <td>80.0</td>\n <td>0.0</td>\n 
<td>20.0</td>\n <td>5.8</td>\n <td>1</td>\n <td>25</td>\n <td>CM-β-CD</td>\n <td>NaN</td>\n <td>C4</td>\n <td>250×4.6</td>\n <td>47.1</td>\n <td>1.2</td>\n <td>NaN</td>\n <td>250.0</td>\n <td>4.6</td>\n </tr>\n <tr>\n <th>...</th>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n </tr>\n <tr>\n <th>712</th>\n <td>NaN</td>\n <td>S-文拉法辛</td>\n <td>NaN</td>\n <td>85.0</td>\n <td>15.0</td>\n <td>0.0</td>\n <td>5.0</td>\n <td>0.5</td>\n <td>30</td>\n <td>CM-β-CD</td>\n <td>20</td>\n <td>C18</td>\n <td>150×4.6</td>\n <td>32.39</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>150.0</td>\n <td>4.6</td>\n </tr>\n <tr>\n <th>713</th>\n <td>NaN</td>\n <td>R-美托洛尔</td>\n <td>NaN</td>\n <td>85.0</td>\n <td>15.0</td>\n <td>0.0</td>\n <td>5.0</td>\n <td>0.5</td>\n <td>30</td>\n <td>CM-β-CD</td>\n <td>40</td>\n <td>C18</td>\n <td>150×4.6</td>\n <td>13.65</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>150.0</td>\n <td>4.6</td>\n </tr>\n <tr>\n <th>714</th>\n <td>NaN</td>\n <td>S-美托洛尔</td>\n <td>NaN</td>\n <td>85.0</td>\n <td>15.0</td>\n <td>0.0</td>\n <td>5.0</td>\n <td>0.5</td>\n <td>30</td>\n <td>CM-β-CD</td>\n <td>40</td>\n <td>C18</td>\n <td>150×4.6</td>\n <td>14.61</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>150.0</td>\n <td>4.6</td>\n </tr>\n <tr>\n <th>715</th>\n <td>NaN</td>\n <td>R-文拉法辛</td>\n <td>NaN</td>\n <td>85.0</td>\n <td>15.0</td>\n <td>0.0</td>\n <td>5.0</td>\n <td>0.5</td>\n <td>30</td>\n <td>CM-β-CD</td>\n <td>40</td>\n <td>C18</td>\n <td>150×4.6</td>\n <td>26.33</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>150.0</td>\n <td>4.6</td>\n </tr>\n <tr>\n <th>716</th>\n <td>NaN</td>\n <td>S-文拉法辛</td>\n <td>NaN</td>\n <td>85.0</td>\n <td>15.0</td>\n <td>0.0</td>\n <td>5.0</td>\n <td>0.5</td>\n <td>30</td>\n <td>CM-β-CD</td>\n <td>40</td>\n <td>C18</td>\n <td>150×4.6</td>\n 
<td>29.22</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>150.0</td>\n <td>4.6</td>\n </tr>\n </tbody>\n</table>\n<p>666 rows × 18 columns</p>\n</div>"
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Keep only the records that report a retention time (保留时间).\n",
"# NOTE(review): the following cells keep operating on `data`, not on `data_clean` -\n",
"# confirm which frame is meant to feed the feature engineering.\n",
"has_retention_time = data[\"保留时间\"].notna()\n",
"data_clean = data[has_retention_time]\n",
"data_clean"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "code",
"execution_count": 40,
"outputs": [],
"source": [
"# Dielectric constants of the mobile-phase solvents. The polarity cell multiplies\n",
"# a_ with the water (水) fraction, b_ with methanol (甲醇) and c_ with acetonitrile (乙腈).\n",
"(\n",
"    a_Dielectricz_Constants,\n",
"    b_Dielectricz_Constants,\n",
"    c_Dielectricz_Constants,\n",
") = (80, 33.6, 37.5)"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "code",
"execution_count": 41,
"outputs": [],
"source": [
"# Rescale the three solvent columns by 1/100 (percentages -> fractions).\n",
"# NOTE: the columns are overwritten in place, so running this cell twice divides twice.\n",
"for solvent_column in (\"水\", \"甲醇\", \"乙腈\"):\n",
"    data[solvent_column] = data[solvent_column] / 100"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "code",
"execution_count": 42,
"outputs": [],
"source": [
"# Estimate the polarity of the mixed mobile phase as the sum of each solvent column\n",
"# weighted by that solvent's dielectric constant.\n",
"water_term = data[\"水\"] * a_Dielectricz_Constants\n",
"methanol_term = data[\"甲醇\"] * b_Dielectricz_Constants\n",
"acetonitrile_term = data[\"乙腈\"] * c_Dielectricz_Constants\n",
"data[\"solvent_polar\"] = water_term + methanol_term + acetonitrile_term"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "code",
"execution_count": 43,
"outputs": [],
"source": [
"# Drop the three solvent columns and the textual column-size column (柱长); the frame\n",
"# keeps solvent_polar and column_y/column_x, which were derived from them.\n",
"# (The former bare `data[\"solvent_polar\"]` line was removed: an expression that is not\n",
"# the last statement of a cell is neither displayed nor stored.)\n",
"data = data.drop(columns=[\"水\", '甲醇', '乙腈', '柱长'])"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "code",
"execution_count": 45,
"outputs": [
{
"data": {
"text/plain": " 序号 分离物质名称 pH 流速 柱温 手性添加剂 添加剂用量 色谱柱 保留时间 column_y \\\n0 1.0 S-Alfuzosin 5.6 1.0 25 CM-B-CD 19.5 C4 42.8 250.0 \n1 2.0 R-Alfuzosin 5.6 1.0 25 CM-B-CD 19.5 C4 47.4 250.0 \n2 3.0 S-Terazosin 6.0 1.0 25 CM-B-CD 32.4 C4 90.0 250.0 \n3 4.0 R-Terazosin 6.0 1.0 25 CM-B-CD 32.4 C4 97.6 250.0 \n4 5.0 S-Doxazosin 5.8 1.0 25 CM-B-CD 13 C4 47.1 250.0 \n.. ... ... ... ... .. ... ... ... ... ... \n275 NaN R-Ibuprofen 4.6 1.0 40 HP-B-CD 25 C18 45.8 150.0 \n276 NaN S-Ibuprofen 4.6 1.0 50 HP-B-CD 25 C18 37.9 150.0 \n277 NaN R-Ibuprofen 4.6 1.0 50 HP-B-CD 25 C18 40.3 150.0 \n278 NaN R-citalopram 4.0 0.8 25 B-CD 20 UF-CN 27.2 250.0 \n279 NaN S-citalopram 4.0 0.8 25 B-CD 20 UF-CN 24.6 250.0 \n\n column_x solvent_polar \n0 4.6 76.8125 \n1 4.6 76.8125 \n2 4.6 78.7250 \n3 4.6 78.7250 \n4 4.6 71.5000 \n.. ... ... \n275 4.6 66.0800 \n276 4.6 66.0800 \n277 4.6 66.0800 \n278 4.6 75.7500 \n279 4.6 75.7500 \n\n[280 rows x 12 columns]",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>序号</th>\n <th>分离物质名称</th>\n <th>pH</th>\n <th>流速</th>\n <th>柱温</th>\n <th>手性添加剂</th>\n <th>添加剂用量</th>\n <th>色谱柱</th>\n <th>保留时间</th>\n <th>column_y</th>\n <th>column_x</th>\n <th>solvent_polar</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>1.0</td>\n <td>S-Alfuzosin</td>\n <td>5.6</td>\n <td>1.0</td>\n <td>25</td>\n <td>CM-B-CD</td>\n <td>19.5</td>\n <td>C4</td>\n <td>42.8</td>\n <td>250.0</td>\n <td>4.6</td>\n <td>76.8125</td>\n </tr>\n <tr>\n <th>1</th>\n <td>2.0</td>\n <td>R-Alfuzosin</td>\n <td>5.6</td>\n <td>1.0</td>\n <td>25</td>\n <td>CM-B-CD</td>\n <td>19.5</td>\n <td>C4</td>\n <td>47.4</td>\n <td>250.0</td>\n <td>4.6</td>\n <td>76.8125</td>\n </tr>\n <tr>\n <th>2</th>\n <td>3.0</td>\n <td>S-Terazosin</td>\n <td>6.0</td>\n <td>1.0</td>\n <td>25</td>\n <td>CM-B-CD</td>\n <td>32.4</td>\n <td>C4</td>\n <td>90.0</td>\n <td>250.0</td>\n <td>4.6</td>\n <td>78.7250</td>\n </tr>\n <tr>\n <th>3</th>\n <td>4.0</td>\n <td>R-Terazosin</td>\n <td>6.0</td>\n <td>1.0</td>\n <td>25</td>\n <td>CM-B-CD</td>\n <td>32.4</td>\n <td>C4</td>\n <td>97.6</td>\n <td>250.0</td>\n <td>4.6</td>\n <td>78.7250</td>\n </tr>\n <tr>\n <th>4</th>\n <td>5.0</td>\n <td>S-Doxazosin</td>\n <td>5.8</td>\n <td>1.0</td>\n <td>25</td>\n <td>CM-B-CD</td>\n <td>13</td>\n <td>C4</td>\n <td>47.1</td>\n <td>250.0</td>\n <td>4.6</td>\n <td>71.5000</td>\n </tr>\n <tr>\n <th>...</th>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n </tr>\n <tr>\n <th>275</th>\n <td>NaN</td>\n <td>R-Ibuprofen</td>\n <td>4.6</td>\n <td>1.0</td>\n <td>40</td>\n 
<td>HP-B-CD</td>\n <td>25</td>\n <td>C18</td>\n <td>45.8</td>\n <td>150.0</td>\n <td>4.6</td>\n <td>66.0800</td>\n </tr>\n <tr>\n <th>276</th>\n <td>NaN</td>\n <td>S-Ibuprofen</td>\n <td>4.6</td>\n <td>1.0</td>\n <td>50</td>\n <td>HP-B-CD</td>\n <td>25</td>\n <td>C18</td>\n <td>37.9</td>\n <td>150.0</td>\n <td>4.6</td>\n <td>66.0800</td>\n </tr>\n <tr>\n <th>277</th>\n <td>NaN</td>\n <td>R-Ibuprofen</td>\n <td>4.6</td>\n <td>1.0</td>\n <td>50</td>\n <td>HP-B-CD</td>\n <td>25</td>\n <td>C18</td>\n <td>40.3</td>\n <td>150.0</td>\n <td>4.6</td>\n <td>66.0800</td>\n </tr>\n <tr>\n <th>278</th>\n <td>NaN</td>\n <td>R-citalopram</td>\n <td>4.0</td>\n <td>0.8</td>\n <td>25</td>\n <td>B-CD</td>\n <td>20</td>\n <td>UF-CN</td>\n <td>27.2</td>\n <td>250.0</td>\n <td>4.6</td>\n <td>75.7500</td>\n </tr>\n <tr>\n <th>279</th>\n <td>NaN</td>\n <td>S-citalopram</td>\n <td>4.0</td>\n <td>0.8</td>\n <td>25</td>\n <td>B-CD</td>\n <td>20</td>\n <td>UF-CN</td>\n <td>24.6</td>\n <td>250.0</td>\n <td>4.6</td>\n <td>75.7500</td>\n </tr>\n </tbody>\n</table>\n<p>280 rows × 12 columns</p>\n</div>"
},
"execution_count": 45,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "code",
"execution_count": 45,
"outputs": [],
"source": [],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
}
],
"metadata": {
"kernelspec": {
"name": "py310",
"language": "python",
"display_name": "py310"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
\ No newline at end of file
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
-- "a/\347\233\264\346\222\255/1-\347\254\254\344\270\200\351\230\266\346\256\265/1.5-\347\273\217\345\205\270\345\272\224\347\224\250-\347\273\217\345\205\270\346\234\272\345\231\250\345\255\246\344\271\240\346\250\241\345\236\213+\345\210\206\347\261\273\351\227\256\351\242\230/.gitkeep" ++ /dev/null
{ ++ /dev/null
{
"cells": [
{
"cell_type": "markdown",
"source": [
"# 分类"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%% md\n"
}
}
},
{
"cell_type": "markdown",
"source": [
"## 导入数据集"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%% md\n"
}
}
},
{
"cell_type": "code",
"execution_count": 1,
"outputs": [],
"source": [
"import numpy as np\n",
"import matplotlib.pyplot as plt\n",
"import pandas as pd\n",
"from sklearn import datasets\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.preprocessing import StandardScaler\n",
"from sklearn.metrics import classification_report, confusion_matrix, accuracy_score\n",
"import time"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "code",
"execution_count": 11,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"特征: ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']\n",
"类别: ['setosa' 'versicolor' 'virginica']\n",
"数据形状: (150, 4)\n"
]
}
],
"source": [
"# Load the iris dataset bundled with scikit-learn.\n",
"iris = datasets.load_iris()\n",
"X, y = iris.data, iris.target  # feature matrix and class-label vector\n",
"\n",
"# Names of the feature columns and of the target classes.\n",
"feature_names, class_names = iris.feature_names, iris.target_names\n",
"\n",
"# Print a short summary of what was loaded.\n",
"for caption, value in ((\"特征:\", feature_names), (\"类别:\", class_names), (\"数据形状:\", X.shape)):\n",
"    print(caption, value)"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "code",
"execution_count": 12,
"outputs": [],
"source": [
"# 划分训练集和测试集 (70%训练, 30%测试)\n",
"X_train, X_test, y_train, y_test = train_test_split(\n",
" X, y, test_size=0.3, random_state=42\n",
")\n",
"\n",
"# 特征标准化\n",
"scaler = StandardScaler()\n",
"X_train = scaler.fit_transform(X_train)\n",
"X_test = scaler.transform(X_test)"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "markdown",
"source": [
"## KNN"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%% md\n"
}
}
},
{
"cell_type": "code",
"execution_count": 21,
"outputs": [],
"source": [
"from sklearn.neighbors import KNeighborsClassifier\n",
"\n",
"# k-nearest-neighbours classifier with k = 3.\n",
"knn = KNeighborsClassifier(n_neighbors=3)\n",
"\n",
"# Time the fit with perf_counter(): it is monotonic and high-resolution, whereas\n",
"# time.time() can be too coarse (or jump) for a fit that takes only milliseconds.\n",
"start_time = time.perf_counter()\n",
"knn.fit(X_train, y_train)\n",
"training_time = time.perf_counter() - start_time\n"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "code",
"execution_count": 22,
"outputs": [
{
"data": {
"text/plain": "0.0018961429595947266"
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"training_time"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "code",
"execution_count": 24,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"测试集准确率: 100.00%\n",
"\n",
"分类报告:\n",
" precision recall f1-score support\n",
"\n",
" setosa 1.00 1.00 1.00 19\n",
" versicolor 1.00 1.00 1.00 13\n",
" virginica 1.00 1.00 1.00 13\n",
"\n",
" accuracy 1.00 45\n",
" macro avg 1.00 1.00 1.00 45\n",
"weighted avg 1.00 1.00 1.00 45\n",
"\n",
"\n",
"混淆矩阵:\n",
"[[19 0 0]\n",
" [ 0 13 0]\n",
" [ 0 0 13]]\n"
]
}
],
"source": [
"# Predict the held-out test split with the fitted KNN model.\n",
"y_pred = knn.predict(X_test)\n",
"\n",
"def evaluate(y_test, y_pred, dataset_name=\"测试集\", target_names=None):\n",
"    \"\"\"Print accuracy, the per-class report and the confusion matrix.\n",
"\n",
"    Args:\n",
"        y_test: Ground-truth labels of the evaluated samples.\n",
"        y_pred: Predicted labels for the same samples.\n",
"        dataset_name: Label used in the accuracy headline. Defaults to \"测试集\"\n",
"            (test set) so existing calls print exactly what they printed before;\n",
"            pass e.g. \"训练集\" when the labels come from the training split.\n",
"        target_names: Display names of the classes for the report. Defaults to\n",
"            the notebook-level `class_names`.\n",
"    \"\"\"\n",
"    if target_names is None:\n",
"        target_names = class_names\n",
"\n",
"    print(\"{}准确率: {:.2f}%\".format(dataset_name, accuracy_score(y_test, y_pred) * 100))\n",
"    print(\"\\n分类报告:\")\n",
"    print(classification_report(y_test, y_pred, target_names=target_names))\n",
"\n",
"    print(\"\\n混淆矩阵:\")\n",
"    print(confusion_matrix(y_test, y_pred))\n",
"\n",
"evaluate(y_test, y_pred)"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "code",
"execution_count": 25,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"测试集准确率: 94.29%\n",
"\n",
"分类报告:\n",
" precision recall f1-score support\n",
"\n",
" setosa 1.00 1.00 1.00 31\n",
" versicolor 0.90 0.95 0.92 37\n",
" virginica 0.94 0.89 0.92 37\n",
"\n",
" accuracy 0.94 105\n",
" macro avg 0.95 0.95 0.95 105\n",
"weighted avg 0.94 0.94 0.94 105\n",
"\n",
"\n",
"混淆矩阵:\n",
"[[31 0 0]\n",
" [ 0 35 2]\n",
" [ 0 4 33]]\n"
]
}
],
"source": [
"# Predict the *training* split with the fitted KNN model.\n",
"y_pred = knn.predict(X_train)\n",
"\n",
"# NOTE: as called here evaluate() titles its headline \"测试集准确率\" (test-set accuracy),\n",
"# but the numbers below are training-set metrics.\n",
"evaluate(y_test=y_train, y_pred=y_pred)"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "markdown",
"source": [
"## 随机森林"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%% md\n"
}
}
},
{
"cell_type": "code",
"execution_count": 26,
"outputs": [],
"source": [
"from sklearn.ensemble import RandomForestClassifier\n",
"\n",
"# Configure the random-forest model.\n",
"rf = RandomForestClassifier(\n",
"    n_estimators=100,     # number of trees in the forest\n",
"    random_state=42,      # fixed seed so the run is reproducible\n",
"    max_features='sqrt',  # features considered at each split: sqrt(n_features)\n",
"    oob_score=True        # also score the model on the out-of-bag samples\n",
")\n",
"\n",
"# Time the fit with perf_counter(): monotonic and high-resolution, unlike time.time().\n",
"start_time = time.perf_counter()\n",
"rf.fit(X_train, y_train)\n",
"training_time = time.perf_counter() - start_time\n",
"\n",
"# Predicted labels and class probabilities for the held-out test split.\n",
"y_pred = rf.predict(X_test)\n",
"y_proba = rf.predict_proba(X_test)\n",
"\n",
"# Summary metrics: test accuracy and the out-of-bag score of the fitted forest.\n",
"accuracy = accuracy_score(y_test, y_pred)\n",
"oob_accuracy = rf.oob_score_"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "code",
"execution_count": 27,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"测试集准确率: 100.00%\n",
"\n",
"分类报告:\n",
" precision recall f1-score support\n",
"\n",
" setosa 1.00 1.00 1.00 19\n",
" versicolor 1.00 1.00 1.00 13\n",
" virginica 1.00 1.00 1.00 13\n",
"\n",
" accuracy 1.00 45\n",
" macro avg 1.00 1.00 1.00 45\n",
"weighted avg 1.00 1.00 1.00 45\n",
"\n",
"\n",
"混淆矩阵:\n",
"[[19 0 0]\n",
" [ 0 13 0]\n",
" [ 0 0 13]]\n"
]
}
],
"source": [
"# Report the metrics for the test split; y_pred is expected to hold the\n",
"# random-forest test predictions computed in the previous cell.\n",
"evaluate(y_test=y_test, y_pred=y_pred)"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "code",
"execution_count": 28,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"测试集准确率: 100.00%\n",
"\n",
"分类报告:\n",
" precision recall f1-score support\n",
"\n",
" setosa 1.00 1.00 1.00 31\n",
" versicolor 1.00 1.00 1.00 37\n",
" virginica 1.00 1.00 1.00 37\n",
"\n",
" accuracy 1.00 105\n",
" macro avg 1.00 1.00 1.00 105\n",
"weighted avg 1.00 1.00 1.00 105\n",
"\n",
"\n",
"混淆矩阵:\n",
"[[31 0 0]\n",
" [ 0 37 0]\n",
" [ 0 0 37]]\n"
]
}
],
"source": [
"# Predict the training split with the fitted random forest.\n",
"y_pred = rf.predict(X_train)\n",
"\n",
"# NOTE: as called here evaluate() titles its headline \"测试集准确率\" (test-set accuracy),\n",
"# but the numbers below are training-set metrics.\n",
"evaluate(y_test=y_train, y_pred=y_pred)"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "markdown",
"source": [
"## SVM"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%% md\n"
}
}
},
{
"cell_type": "code",
"execution_count": 29,
"outputs": [
{
"data": {
"text/plain": "SVC(C=10, gamma=0.1, probability=True)",
"text/html": "<style>#sk-container-id-2 {color: black;}#sk-container-id-2 pre{padding: 0;}#sk-container-id-2 div.sk-toggleable {background-color: white;}#sk-container-id-2 label.sk-toggleable__label {cursor: pointer;display: block;width: 100%;margin-bottom: 0;padding: 0.3em;box-sizing: border-box;text-align: center;}#sk-container-id-2 label.sk-toggleable__label-arrow:before {content: \"▸\";float: left;margin-right: 0.25em;color: #696969;}#sk-container-id-2 label.sk-toggleable__label-arrow:hover:before {color: black;}#sk-container-id-2 div.sk-estimator:hover label.sk-toggleable__label-arrow:before {color: black;}#sk-container-id-2 div.sk-toggleable__content {max-height: 0;max-width: 0;overflow: hidden;text-align: left;background-color: #f0f8ff;}#sk-container-id-2 div.sk-toggleable__content pre {margin: 0.2em;color: black;border-radius: 0.25em;background-color: #f0f8ff;}#sk-container-id-2 input.sk-toggleable__control:checked~div.sk-toggleable__content {max-height: 200px;max-width: 100%;overflow: auto;}#sk-container-id-2 input.sk-toggleable__control:checked~label.sk-toggleable__label-arrow:before {content: \"▾\";}#sk-container-id-2 div.sk-estimator input.sk-toggleable__control:checked~label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-2 div.sk-label input.sk-toggleable__control:checked~label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-2 input.sk-hidden--visually {border: 0;clip: rect(1px 1px 1px 1px);clip: rect(1px, 1px, 1px, 1px);height: 1px;margin: -1px;overflow: hidden;padding: 0;position: absolute;width: 1px;}#sk-container-id-2 div.sk-estimator {font-family: monospace;background-color: #f0f8ff;border: 1px dotted black;border-radius: 0.25em;box-sizing: border-box;margin-bottom: 0.5em;}#sk-container-id-2 div.sk-estimator:hover {background-color: #d4ebff;}#sk-container-id-2 div.sk-parallel-item::after {content: \"\";width: 100%;border-bottom: 1px solid gray;flex-grow: 1;}#sk-container-id-2 div.sk-label:hover 
label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-2 div.sk-serial::before {content: \"\";position: absolute;border-left: 1px solid gray;box-sizing: border-box;top: 0;bottom: 0;left: 50%;z-index: 0;}#sk-container-id-2 div.sk-serial {display: flex;flex-direction: column;align-items: center;background-color: white;padding-right: 0.2em;padding-left: 0.2em;position: relative;}#sk-container-id-2 div.sk-item {position: relative;z-index: 1;}#sk-container-id-2 div.sk-parallel {display: flex;align-items: stretch;justify-content: center;background-color: white;position: relative;}#sk-container-id-2 div.sk-item::before, #sk-container-id-2 div.sk-parallel-item::before {content: \"\";position: absolute;border-left: 1px solid gray;box-sizing: border-box;top: 0;bottom: 0;left: 50%;z-index: -1;}#sk-container-id-2 div.sk-parallel-item {display: flex;flex-direction: column;z-index: 1;position: relative;background-color: white;}#sk-container-id-2 div.sk-parallel-item:first-child::after {align-self: flex-end;width: 50%;}#sk-container-id-2 div.sk-parallel-item:last-child::after {align-self: flex-start;width: 50%;}#sk-container-id-2 div.sk-parallel-item:only-child::after {width: 0;}#sk-container-id-2 div.sk-dashed-wrapped {border: 1px dashed gray;margin: 0 0.4em 0.5em 0.4em;box-sizing: border-box;padding-bottom: 0.4em;background-color: white;}#sk-container-id-2 div.sk-label label {font-family: monospace;font-weight: bold;display: inline-block;line-height: 1.2em;}#sk-container-id-2 div.sk-label-container {text-align: center;}#sk-container-id-2 div.sk-container {/* jupyter's `normalize.less` sets `[hidden] { display: none; }` but bootstrap.min.css set `[hidden] { display: none !important; }` so we also need the `!important` here to be able to override the default hidden behavior on the sphinx rendered scikit-learn.org. 
See: https://github.com/scikit-learn/scikit-learn/issues/21755 */display: inline-block !important;position: relative;}#sk-container-id-2 div.sk-text-repr-fallback {display: none;}</style><div id=\"sk-container-id-2\" class=\"sk-top-container\"><div class=\"sk-text-repr-fallback\"><pre>SVC(C=10, gamma=0.1, probability=True)</pre><b>In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. <br />On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.</b></div><div class=\"sk-container\" hidden><div class=\"sk-item\"><div class=\"sk-estimator sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-2\" type=\"checkbox\" checked><label for=\"sk-estimator-id-2\" class=\"sk-toggleable__label sk-toggleable__label-arrow\">SVC</label><div class=\"sk-toggleable__content\"><pre>SVC(C=10, gamma=0.1, probability=True)</pre></div></div></div></div></div>"
},
"execution_count": 29,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Support vector machine classifier\n",
"from sklearn.svm import SVC\n",
"\n",
"# RBF kernel, chosen for a non-linear problem; probability=True turns on probability estimates.\n",
"svc_rbf = SVC(\n",
"    kernel='rbf',\n",
"    C=10,\n",
"    gamma=0.1,\n",
"    probability=True,\n",
")\n",
"svc_rbf.fit(X_train, y_train)"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "code",
"execution_count": 30,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"测试集准确率: 97.14%\n",
"\n",
"分类报告:\n",
" precision recall f1-score support\n",
"\n",
" setosa 1.00 1.00 1.00 31\n",
" versicolor 1.00 0.92 0.96 37\n",
" virginica 0.93 1.00 0.96 37\n",
"\n",
" accuracy 0.97 105\n",
" macro avg 0.97 0.97 0.97 105\n",
"weighted avg 0.97 0.97 0.97 105\n",
"\n",
"\n",
"混淆矩阵:\n",
"[[31 0 0]\n",
" [ 0 34 3]\n",
" [ 0 0 37]]\n"
]
}
],
"source": [
"# Score the RBF model on the training split itself.\n",
"# NOTE: the saved output of this cell labels the accuracy as test-set accuracy,\n",
"# although the training split is what is being scored here.\n",
"y_pred = svc_rbf.predict(X_train)\n",
"\n",
"# Print the evaluation metrics.\n",
"evaluate(y_train, y_pred)"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "code",
"execution_count": 31,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"测试集准确率: 97.78%\n",
"\n",
"分类报告:\n",
" precision recall f1-score support\n",
"\n",
" setosa 1.00 1.00 1.00 19\n",
" versicolor 1.00 0.92 0.96 13\n",
" virginica 0.93 1.00 0.96 13\n",
"\n",
" accuracy 0.98 45\n",
" macro avg 0.98 0.97 0.97 45\n",
"weighted avg 0.98 0.98 0.98 45\n",
"\n",
"\n",
"混淆矩阵:\n",
"[[19 0 0]\n",
" [ 0 12 1]\n",
" [ 0 0 13]]\n"
]
}
],
"source": [
"# Linear-kernel SVM (kernel='linear'), fitted on the training split.\n",
"svc_linear = SVC(kernel='linear', C=1.0, probability=True)\n",
"svc_linear.fit(X_train, y_train)\n",
"# Predict on the test split (not the training split).\n",
"y_pred = svc_linear.predict(X_test)\n",
"# Print the evaluation metrics for the test-split predictions.\n",
"evaluate(y_test, y_pred)"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "markdown",
"source": [
"# 回归"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%% md\n"
}
}
},
{
"cell_type": "code",
"execution_count": 2,
"outputs": [],
"source": [
"from sklearn.datasets import fetch_california_housing\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.preprocessing import StandardScaler\n",
"\n",
"# Load the California housing dataset: feature matrix X and target vector y.\n",
"data = fetch_california_housing()\n",
"X = data.data\n",
"y = data.target\n",
"\n",
"# Split first (70% train / 30% test) so the test rows cannot influence preprocessing.\n",
"X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)\n",
"\n",
"# Standardize features: fit the scaler on the training split only, then apply\n",
"# the same fitted transform to the test split (avoids train/test leakage).\n",
"scaler = StandardScaler()\n",
"X_train = scaler.fit_transform(X_train_raw)\n",
"X_test = scaler.transform(X_test_raw)\n",
"\n",
"# Keep X_scaled defined for any code that still refers to it; every row is\n",
"# transformed with the scaler fitted on the training split.\n",
"X_scaled = scaler.transform(X)"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "code",
"execution_count": 8,
"outputs": [
{
"data": {
"text/plain": " MedInc HouseAge AveRooms AveBedrms Population AveOccup Latitude \\\n0 8.3252 41.0 6.984127 1.023810 322.0 2.555556 37.88 \n1 8.3014 21.0 6.238137 0.971880 2401.0 2.109842 37.86 \n2 7.2574 52.0 8.288136 1.073446 496.0 2.802260 37.85 \n3 5.6431 52.0 5.817352 1.073059 558.0 2.547945 37.85 \n4 3.8462 52.0 6.281853 1.081081 565.0 2.181467 37.85 \n... ... ... ... ... ... ... ... \n20635 1.5603 25.0 5.045455 1.133333 845.0 2.560606 39.48 \n20636 2.5568 18.0 6.114035 1.315789 356.0 3.122807 39.49 \n20637 1.7000 17.0 5.205543 1.120092 1007.0 2.325635 39.43 \n20638 1.8672 18.0 5.329513 1.171920 741.0 2.123209 39.43 \n20639 2.3886 16.0 5.254717 1.162264 1387.0 2.616981 39.37 \n\n Longitude \n0 -122.23 \n1 -122.22 \n2 -122.24 \n3 -122.25 \n4 -122.25 \n... ... \n20635 -121.09 \n20636 -121.21 \n20637 -121.22 \n20638 -121.32 \n20639 -121.24 \n\n[20640 rows x 8 columns]",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>MedInc</th>\n <th>HouseAge</th>\n <th>AveRooms</th>\n <th>AveBedrms</th>\n <th>Population</th>\n <th>AveOccup</th>\n <th>Latitude</th>\n <th>Longitude</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>8.3252</td>\n <td>41.0</td>\n <td>6.984127</td>\n <td>1.023810</td>\n <td>322.0</td>\n <td>2.555556</td>\n <td>37.88</td>\n <td>-122.23</td>\n </tr>\n <tr>\n <th>1</th>\n <td>8.3014</td>\n <td>21.0</td>\n <td>6.238137</td>\n <td>0.971880</td>\n <td>2401.0</td>\n <td>2.109842</td>\n <td>37.86</td>\n <td>-122.22</td>\n </tr>\n <tr>\n <th>2</th>\n <td>7.2574</td>\n <td>52.0</td>\n <td>8.288136</td>\n <td>1.073446</td>\n <td>496.0</td>\n <td>2.802260</td>\n <td>37.85</td>\n <td>-122.24</td>\n </tr>\n <tr>\n <th>3</th>\n <td>5.6431</td>\n <td>52.0</td>\n <td>5.817352</td>\n <td>1.073059</td>\n <td>558.0</td>\n <td>2.547945</td>\n <td>37.85</td>\n <td>-122.25</td>\n </tr>\n <tr>\n <th>4</th>\n <td>3.8462</td>\n <td>52.0</td>\n <td>6.281853</td>\n <td>1.081081</td>\n <td>565.0</td>\n <td>2.181467</td>\n <td>37.85</td>\n <td>-122.25</td>\n </tr>\n <tr>\n <th>...</th>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n </tr>\n <tr>\n <th>20635</th>\n <td>1.5603</td>\n <td>25.0</td>\n <td>5.045455</td>\n <td>1.133333</td>\n <td>845.0</td>\n <td>2.560606</td>\n <td>39.48</td>\n <td>-121.09</td>\n </tr>\n <tr>\n <th>20636</th>\n <td>2.5568</td>\n <td>18.0</td>\n <td>6.114035</td>\n <td>1.315789</td>\n <td>356.0</td>\n <td>3.122807</td>\n <td>39.49</td>\n <td>-121.21</td>\n </tr>\n <tr>\n <th>20637</th>\n <td>1.7000</td>\n <td>17.0</td>\n <td>5.205543</td>\n <td>1.120092</td>\n 
<td>1007.0</td>\n <td>2.325635</td>\n <td>39.43</td>\n <td>-121.22</td>\n </tr>\n <tr>\n <th>20638</th>\n <td>1.8672</td>\n <td>18.0</td>\n <td>5.329513</td>\n <td>1.171920</td>\n <td>741.0</td>\n <td>2.123209</td>\n <td>39.43</td>\n <td>-121.32</td>\n </tr>\n <tr>\n <th>20639</th>\n <td>2.3886</td>\n <td>16.0</td>\n <td>5.254717</td>\n <td>1.162264</td>\n <td>1387.0</td>\n <td>2.616981</td>\n <td>39.37</td>\n <td>-121.24</td>\n </tr>\n </tbody>\n</table>\n<p>20640 rows × 8 columns</p>\n</div>"
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Wrap the raw (unscaled) feature matrix in a DataFrame labelled with the dataset's feature names.\n",
"df = pd.DataFrame(data=X, columns=data.feature_names)\n",
"df"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "markdown",
"source": [
"## SVM"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%% md\n"
}
}
},
{
"cell_type": "code",
"execution_count": 6,
"outputs": [],
"source": [
"from sklearn.svm import SVR\n",
"from sklearn.metrics import mean_squared_error, r2_score\n",
"\n",
"# RBF-kernel support vector regressor with C=1.0 and epsilon=0.1.\n",
"svr = SVR(\n",
"    kernel='rbf',\n",
"    C=1.0,\n",
"    epsilon=0.1,\n",
")\n",
"\n",
"# Fit on the training split.\n",
"svr.fit(X_train, y_train)\n",
"\n",
"# Predict on the test split; this cell only produces y_pred and computes no metrics.\n",
"y_pred = svr.predict(X_test)"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "code",
"execution_count": 11,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"r2: 0.73\n",
"mse: 0.35\n"
]
}
],
"source": [
"def r2_mse(y_test, y_pred):\n",
"    \"\"\"Print the R^2 score and mean squared error of a set of predictions.\n",
"\n",
"    Args:\n",
"        y_test: Ground-truth target values.\n",
"        y_pred: Predicted target values, compared against y_test.\n",
"\n",
"    Returns:\n",
"        None. Both metrics are printed with two decimal places, r2 first.\n",
"    \"\"\"\n",
"    mse_value = mean_squared_error(y_test, y_pred)\n",
"    r2_value = r2_score(y_test, y_pred)\n",
"    for template, value in ((\"r2: {:.2f}\", r2_value), (\"mse: {:.2f}\", mse_value)):\n",
"        print(template.format(value))\n",
"\n",
"\n",
"r2_mse(y_test, y_pred)"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "markdown",
"source": [
"## 决策树"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%% md\n"
}
}
},
{
"cell_type": "code",
"execution_count": 12,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"r2: 0.60\n",
"mse: 0.52\n"
]
}
],
"source": [
"from sklearn.tree import DecisionTreeRegressor\n",
"\n",
"# Regression tree capped at depth 5, with min_samples_split=5 and a fixed random_state.\n",
"DT = DecisionTreeRegressor(max_depth=5, min_samples_split=5, random_state=42)\n",
"\n",
"# Fit on the training split.\n",
"DT.fit(X_train, y_train)\n",
"\n",
"# Predict on the test split and print R^2 / MSE.\n",
"y_pred = DT.predict(X_test)\n",
"r2_mse(y_test, y_pred)"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "markdown",
"source": [
"## 随机森林"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%% md\n"
}
}
},
{
"cell_type": "code",
"execution_count": 13,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"r2: 0.78\n",
"mse: 0.29\n"
]
}
],
"source": [
"from sklearn.ensemble import RandomForestRegressor\n",
"\n",
"# Random forest of 100 trees, each capped at depth 10, with a fixed random_state.\n",
"RF = RandomForestRegressor(n_estimators=100, max_depth=10, random_state=42)\n",
"\n",
"# Fit on the training split.\n",
"RF.fit(X_train, y_train)\n",
"\n",
"# Predict on the test split and print R^2 / MSE.\n",
"y_pred = RF.predict(X_test)\n",
"r2_mse(y_test, y_pred)"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
\ No newline at end of file
-- "a/\347\233\264\346\222\255/1-\347\254\254\344\270\200\351\230\266\346\256\265/1.6-\347\273\217\345\205\270\345\272\224\347\224\250-\347\273\217\345\205\270\346\234\272\345\231\250\345\255\246\344\271\240\346\250\241\345\236\213+\345\233\236\345\275\222\351\227\256\351\242\230/.gitkeep" ++ /dev/null
-- "a/\347\233\264\346\222\255/1-\347\254\254\344\270\200\351\230\266\346\256\265/1.7-\347\273\217\345\205\270\345\272\224\347\224\250-\350\277\233\351\230\266\346\234\272\345\231\250\345\255\246\344\271\240\346\250\241\345\236\213+\345\210\206\347\261\273\351\227\256\351\242\230/.gitkeep" ++ /dev/null
-- "a/\347\233\264\346\222\255/2-\347\254\254\344\272\214\351\230\266\346\256\265/2.1-AI+\347\247\221\347\240\224\345\210\233\346\226\260-\347\224\237\345\214\226\347\216\257\346\235\220\346\225\260\346\215\256\347\232\204\347\211\271\345\276\201\346\217\220\345\217\226/.gitkeep" ++ /dev/null
-- "a/\347\233\264\346\222\255/2-\347\254\254\344\272\214\351\230\266\346\256\265/2.2-\347\273\217\345\205\270\345\272\224\347\224\250-\347\245\236\347\273\217\347\275\221\347\273\234\346\250\241\345\236\213+\347\224\237\345\214\226\347\216\257\346\235\220/.gitkeep" ++ /dev/null
-- "a/\347\233\264\346\222\255/2-\347\254\254\344\272\214\351\230\266\346\256\265/2.3-AI+\347\247\221\347\240\224\345\210\233\346\226\260-AI+\347\224\237\345\214\226\347\216\257\346\235\220\345\256\236\351\252\214\350\256\276\350\256\241/.gitkeep" ++ /dev/null
-- "a/\347\233\264\346\222\255/2-\347\254\254\344\272\214\351\230\266\346\256\265/2.4-\347\273\217\345\205\270\345\272\224\347\224\250AI+\347\247\221\347\240\224\345\210\233\346\226\260-AI+\347\224\237\347\211\251\345\255\246\347\247\221/.gitkeep" ++ /dev/null
-- "a/\347\233\264\346\222\255/2-\347\254\254\344\272\214\351\230\266\346\256\265/2.5-\347\273\217\345\205\270\345\272\224\347\224\250AI+\347\247\221\347\240\224\345\210\233\346\226\260-AI+\345\214\226\345\255\246\345\255\246\347\247\221/.gitkeep" ++ /dev/null
-- "a/\347\233\264\346\222\255/2-\347\254\254\344\272\214\351\230\266\346\256\265/2.6-\347\273\217\345\205\270\345\272\224\347\224\250AI+\347\247\221\347\240\224\345\210\233\346\226\260-AI+\347\216\257\345\242\203\345\255\246\347\247\221/.gitkeep" ++ /dev/null
-- "a/\347\233\264\346\222\255/2-\347\254\254\344\272\214\351\230\266\346\256\265/2.7-\347\273\217\345\205\270\345\272\224\347\224\250AI+\347\247\221\347\240\224\345\210\233\346\226\260-AI+\346\235\220\346\226\231\345\255\246\347\247\221/.gitkeep" ++ /dev/null
-- "a/\347\233\264\346\222\255/2-\347\254\254\344\272\214\351\230\266\346\256\265/2.8-AI+\347\247\221\347\240\224\345\210\233\346\226\260-AI+\347\273\223\346\236\234\345\210\206\346\236\220/.gitkeep" ++ /dev/null
-- "a/\347\233\264\346\222\255/3-\347\254\254\344\270\211\351\230\266\346\256\265/3.1-\347\247\221\347\240\224\350\256\272\346\226\207\346\241\206\346\236\266\343\200\201\345\267\245\345\205\267/.gitkeep" ++ /dev/null
-- "a/\347\233\264\346\222\255/3-\347\254\254\344\270\211\351\230\266\346\256\265/3.2-\347\247\221\347\240\224\350\256\272\346\226\207\345\206\231\344\275\234-\345\274\225\350\250\200/.gitkeep" ++ /dev/null
-- "a/\347\233\264\346\222\255/3-\347\254\254\344\270\211\351\230\266\346\256\265/3.3-\347\247\221\347\240\224\350\256\272\346\226\207\345\206\231\344\275\234-\346\226\271\346\263\225\343\200\201\347\273\223\346\236\234/.gitkeep" ++ /dev/null
-- "a/\347\233\264\346\222\255/3-\347\254\254\344\270\211\351\230\266\346\256\265/3.4-\347\247\221\347\240\224\350\256\272\346\226\207\345\206\231\344\275\234-\351\242\230\347\233\256\343\200\201\346\221\230\350\246\201\343\200\201\350\256\250\350\256\272/.gitkeep" ++ /dev/null
-- "a/\347\233\264\346\222\255/3-\347\254\254\344\270\211\351\230\266\346\256\265/3.5-\347\247\221\347\240\224\350\256\272\346\226\207\345\206\231\344\275\234-\346\227\266\346\200\201\343\200\201\350\277\236\346\216\245\350\257\215/.gitkeep" ++ /dev/null
-- "a/\347\233\264\346\222\255/3-\347\254\254\344\270\211\351\230\266\346\256\265/3.6-\347\247\221\347\240\224\350\256\272\346\226\207\351\200\211\345\210\212\343\200\201\346\212\225\347\250\277/.gitkeep" ++ /dev/null
-- "a/\347\233\264\346\222\255/3-\347\254\254\344\270\211\351\230\266\346\256\265/3.7-\347\247\221\347\240\224\350\256\272\346\226\207\350\277\224\344\277\256\343\200\201\346\240\241\347\250\277/.gitkeep" ++ /dev/null
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment