Commit b9ee4a9b by 前钰

Upload New File

parent f3ff6d55
{
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "c129fa56",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>student_id</th>\n",
" <th>name</th>\n",
" <th>age</th>\n",
" <th>score</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1001</td>\n",
" <td>张三</td>\n",
" <td>16</td>\n",
" <td>88</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1002</td>\n",
" <td>李四</td>\n",
" <td>17</td>\n",
" <td>92</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1002</td>\n",
" <td>李四</td>\n",
" <td>17</td>\n",
" <td>92</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1003</td>\n",
" <td>王五</td>\n",
" <td>16</td>\n",
" <td>105</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1004</td>\n",
" <td>赵六</td>\n",
" <td>15</td>\n",
" <td>76</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>1005</td>\n",
" <td>钱七</td>\n",
" <td>18</td>\n",
" <td>-5</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" student_id name age score\n",
"0 1001 张三 16 88\n",
"1 1002 李四 17 92\n",
"2 1002 李四 17 92\n",
"3 1003 王五 16 105\n",
"4 1004 赵六 15 76\n",
"5 1005 钱七 18 -5"
]
},
"execution_count": 1,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import pandas as pd\n",
"\n",
"data = {\n",
" 'student_id': [1001, 1002, 1002, 1003, 1004, 1005],\n",
" 'name': ['张三', '李四', '李四', '王五', '赵六', '钱七'],\n",
" 'age': [16, 17, 17, 16, 15, 18],\n",
" 'score': [88, 92, 92, 105, 76, -5]\n",
"}\n",
"\n",
"df = pd.DataFrame(data)\n",
"df"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "98fac49b",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>student_id</th>\n",
" <th>name</th>\n",
" <th>age</th>\n",
" <th>score</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1002</td>\n",
" <td>李四</td>\n",
" <td>17</td>\n",
" <td>92</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" student_id name age score\n",
"2 1002 李四 17 92"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df[df.duplicated()] # 筛选出所有重复的行(只保留不是第一次出现的那些行)\n"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "fdd9d74a",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>student_id</th>\n",
" <th>name</th>\n",
" <th>age</th>\n",
" <th>score</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1001</td>\n",
" <td>张三</td>\n",
" <td>16</td>\n",
" <td>88</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1002</td>\n",
" <td>李四</td>\n",
" <td>17</td>\n",
" <td>92</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1003</td>\n",
" <td>王五</td>\n",
" <td>16</td>\n",
" <td>105</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1004</td>\n",
" <td>赵六</td>\n",
" <td>15</td>\n",
" <td>76</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>1005</td>\n",
" <td>钱七</td>\n",
" <td>18</td>\n",
" <td>-5</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" student_id name age score\n",
"0 1001 张三 16 88\n",
"1 1002 李四 17 92\n",
"3 1003 王五 16 105\n",
"4 1004 赵六 15 76\n",
"5 1005 钱七 18 -5"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = df.drop_duplicates() # 删除所有重复的行,只保留第一次出现的那一行,返回去重后的新DataFrame,并赋值回原变量 df\n",
"df"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "b9ad4feb",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>student_id</th>\n",
" <th>name</th>\n",
" <th>age</th>\n",
" <th>score</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1003</td>\n",
" <td>王五</td>\n",
" <td>16</td>\n",
" <td>105</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>1005</td>\n",
" <td>钱七</td>\n",
" <td>18</td>\n",
" <td>-5</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" student_id name age score\n",
"3 1003 王五 16 105\n",
"5 1005 钱七 18 -5"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"outliers = df[(df['score'] < 0) | (df['score'] > 100)] # # 找出 'score' 列中小于0或大于100的异常值行\n",
"outliers"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "48b96818",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>student_id</th>\n",
" <th>name</th>\n",
" <th>age</th>\n",
" <th>score</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1001</td>\n",
" <td>张三</td>\n",
" <td>16</td>\n",
" <td>88</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1002</td>\n",
" <td>李四</td>\n",
" <td>17</td>\n",
" <td>92</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1004</td>\n",
" <td>赵六</td>\n",
" <td>15</td>\n",
" <td>76</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" student_id name age score\n",
"0 1001 张三 16 88\n",
"1 1002 李四 17 92\n",
"4 1004 赵六 15 76"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_clean = df[(df['score'] >= 0) & (df['score'] <= 100)] # # 筛选出 'score' 列在 0 到 100 之间的正常值行,去除异常数据,并保存为 df_clean\n",
"df_clean"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "9d4126a5",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>student_id</th>\n",
" <th>name</th>\n",
" <th>age</th>\n",
" <th>score</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1001</td>\n",
" <td>张三</td>\n",
" <td>16</td>\n",
" <td>88</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1002</td>\n",
" <td>李四</td>\n",
" <td>17</td>\n",
" <td>92</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1003</td>\n",
" <td>王五</td>\n",
" <td>16</td>\n",
" <td>105</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1004</td>\n",
" <td>赵六</td>\n",
" <td>15</td>\n",
" <td>76</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>1005</td>\n",
" <td>钱七</td>\n",
" <td>18</td>\n",
" <td>-5</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" student_id name age score\n",
"0 1001 张三 16 88\n",
"1 1002 李四 17 92\n",
"3 1003 王五 16 105\n",
"4 1004 赵六 15 76\n",
"5 1005 钱七 18 -5"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# The range filter in the previous cell produced a separate frame (df_clean);\n",
"# df itself still contains the out-of-range scores.\n",
"df"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "3c426034",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\Users\\10622\\AppData\\Local\\Temp\\ipykernel_31228\\2422624131.py:3: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" df['score'] = df['score'].apply(lambda x: mean_score if x < 0 or x > 100 else x)\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>student_id</th>\n",
" <th>name</th>\n",
" <th>age</th>\n",
" <th>score</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1001</td>\n",
" <td>张三</td>\n",
" <td>16</td>\n",
" <td>88.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1002</td>\n",
" <td>李四</td>\n",
" <td>17</td>\n",
" <td>92.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1003</td>\n",
" <td>王五</td>\n",
" <td>16</td>\n",
" <td>85.333333</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1004</td>\n",
" <td>赵六</td>\n",
" <td>15</td>\n",
" <td>76.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>1005</td>\n",
" <td>钱七</td>\n",
" <td>18</td>\n",
" <td>85.333333</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" student_id name age score\n",
"0 1001 张三 16 88.000000\n",
"1 1002 李四 17 92.000000\n",
"3 1003 王五 16 85.333333\n",
"4 1004 赵六 15 76.000000\n",
"5 1005 钱七 18 85.333333"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# 替换异常值(用均值替换)\n",
"mean_score = df_clean['score'].mean()\n",
"df['score'] = df['score'].apply(lambda x: mean_score if x < 0 or x > 100 else x)\n",
"df"
]
},
{
"cell_type": "markdown",
"id": "deb93909",
"metadata": {},
"source": [
"## 处理缺失值"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "90760e40",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" student_id name age score\n",
"0 1001 张三 16.0 88.0\n",
"1 1002 李四 17.0 NaN\n",
"2 1003 王五 NaN 105.0\n",
"3 1004 赵六 15.0 76.0\n",
"4 1005 None 18.0 92.0\n"
]
}
],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"\n",
"data = {\n",
" 'student_id': [1001, 1002, 1003, 1004, 1005],\n",
" 'name': ['张三', '李四', '王五', '赵六', None],\n",
" 'age': [16, 17, None, 15, 18],\n",
" 'score': [88, np.nan, 105, 76, 92]\n",
"}\n",
"\n",
"df = pd.DataFrame(data)\n",
"print(df)\n"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "807b55c2",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" student_id name age score\n",
"0 False False False False\n",
"1 False False False True\n",
"2 False False True False\n",
"3 False False False False\n",
"4 False True False False\n",
"student_id 0\n",
"name 1\n",
"age 1\n",
"score 1\n",
"dtype: int64\n"
]
}
],
"source": [
"# 检查哪些位置是缺失的\n",
"print(df.isnull())\n",
"\n",
"# 每列有多少缺失\n",
"print(df.isnull().sum())\n"
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "ad999031",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>student_id</th>\n",
" <th>name</th>\n",
" <th>age</th>\n",
" <th>score</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1001</td>\n",
" <td>张三</td>\n",
" <td>16.0</td>\n",
" <td>88.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1002</td>\n",
" <td>李四</td>\n",
" <td>17.0</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1003</td>\n",
" <td>王五</td>\n",
" <td>NaN</td>\n",
" <td>105.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1004</td>\n",
" <td>赵六</td>\n",
" <td>15.0</td>\n",
" <td>76.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1005</td>\n",
" <td>None</td>\n",
" <td>18.0</td>\n",
" <td>92.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" student_id name age score\n",
"0 1001 张三 16.0 88.0\n",
"1 1002 李四 17.0 NaN\n",
"2 1003 王五 NaN 105.0\n",
"3 1004 赵六 15.0 76.0\n",
"4 1005 None 18.0 92.0"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_drop = df.dropna() # 删除含缺失值的行\n",
"df"
]
},
{
"cell_type": "code",
"execution_count": 18,
"id": "fc18d0bd",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>student_id</th>\n",
" <th>name</th>\n",
" <th>age</th>\n",
" <th>score</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1001</td>\n",
" <td>张三</td>\n",
" <td>16.0</td>\n",
" <td>88.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1002</td>\n",
" <td>李四</td>\n",
" <td>17.0</td>\n",
" <td>90.25</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1003</td>\n",
" <td>王五</td>\n",
" <td>NaN</td>\n",
" <td>105.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1004</td>\n",
" <td>赵六</td>\n",
" <td>15.0</td>\n",
" <td>76.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1005</td>\n",
" <td>None</td>\n",
" <td>18.0</td>\n",
" <td>92.00</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" student_id name age score\n",
"0 1001 张三 16.0 88.00\n",
"1 1002 李四 17.0 90.25\n",
"2 1003 王五 NaN 105.00\n",
"3 1004 赵六 15.0 76.00\n",
"4 1005 None 18.0 92.00"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df['score'] = df['score'].fillna(df['score'].mean()) # 填充缺失值\n",
"df"
]
},
{
"cell_type": "code",
"execution_count": 19,
"id": "fd2f2869",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>student_id</th>\n",
" <th>name</th>\n",
" <th>age</th>\n",
" <th>score</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1001</td>\n",
" <td>张三</td>\n",
" <td>16.0</td>\n",
" <td>88.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1002</td>\n",
" <td>李四</td>\n",
" <td>17.0</td>\n",
" <td>90.25</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1003</td>\n",
" <td>王五</td>\n",
" <td>10.0</td>\n",
" <td>105.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1004</td>\n",
" <td>赵六</td>\n",
" <td>15.0</td>\n",
" <td>76.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1005</td>\n",
" <td>未知</td>\n",
" <td>18.0</td>\n",
" <td>92.00</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" student_id name age score\n",
"0 1001 张三 16.0 88.00\n",
"1 1002 李四 17.0 90.25\n",
"2 1003 王五 10.0 105.00\n",
"3 1004 赵六 15.0 76.00\n",
"4 1005 未知 18.0 92.00"
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df['age'] = df['age'].fillna(10) # 填固定值(如年龄填0、姓名填“未知”)\n",
"df['name'] = df['name'].fillna('未知')\n",
"df"
]
},
{
"cell_type": "code",
"execution_count": 20,
"id": "a89f866f",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" student_id name age score\n",
"0 1001 张三 16.0 88.0\n",
"1 1002 李四 17.0 NaN\n",
"2 1003 王五 NaN 105.0\n",
"3 1004 赵六 15.0 76.0\n",
"4 1005 None 18.0 92.0\n"
]
}
],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"\n",
"data = {\n",
" 'student_id': [1001, 1002, 1003, 1004, 1005],\n",
" 'name': ['张三', '李四', '王五', '赵六', None],\n",
" 'age': [16, 17, None, 15, 18],\n",
" 'score': [88, np.nan, 105, 76, 92]\n",
"}\n",
"\n",
"df = pd.DataFrame(data)\n",
"print(df)\n"
]
},
{
"cell_type": "code",
"execution_count": 21,
"id": "b76bc377",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\Users\\10622\\AppData\\Local\\Temp\\ipykernel_31228\\1427996482.py:1: FutureWarning: DataFrame.fillna with 'method' is deprecated and will raise in a future version. Use obj.ffill() or obj.bfill() instead.\n",
" df.fillna(method='ffill', inplace=True) # 用前一条记录填充\n",
"C:\\Users\\10622\\AppData\\Local\\Temp\\ipykernel_31228\\1427996482.py:2: FutureWarning: DataFrame.fillna with 'method' is deprecated and will raise in a future version. Use obj.ffill() or obj.bfill() instead.\n",
" df.fillna(method='bfill', inplace=True) # 用后一条记录填充\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>student_id</th>\n",
" <th>name</th>\n",
" <th>age</th>\n",
" <th>score</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1001</td>\n",
" <td>张三</td>\n",
" <td>16.0</td>\n",
" <td>88.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1002</td>\n",
" <td>李四</td>\n",
" <td>17.0</td>\n",
" <td>88.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1003</td>\n",
" <td>王五</td>\n",
" <td>17.0</td>\n",
" <td>105.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1004</td>\n",
" <td>赵六</td>\n",
" <td>15.0</td>\n",
" <td>76.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1005</td>\n",
" <td>赵六</td>\n",
" <td>18.0</td>\n",
" <td>92.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" student_id name age score\n",
"0 1001 张三 16.0 88.0\n",
"1 1002 李四 17.0 88.0\n",
"2 1003 王五 17.0 105.0\n",
"3 1004 赵六 15.0 76.0\n",
"4 1005 赵六 18.0 92.0"
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.fillna(method='ffill', inplace=True) # 用前一条记录填充\n",
"df.fillna(method='bfill', inplace=True) # 用后一条记录填充\n",
"df"
]
},
{
"cell_type": "markdown",
"id": "f3c38aca",
"metadata": {},
"source": [
"## 归一化"
]
},
{
"cell_type": "code",
"execution_count": 25,
"id": "bd1b040c",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" score score_norm\n",
"0 45 0.000000\n",
"1 70 0.454545\n",
"2 90 0.818182\n",
"3 100 1.000000\n",
"4 60 0.272727\n"
]
}
],
"source": [
"from sklearn.preprocessing import MinMaxScaler\n",
"import pandas as pd\n",
"\n",
"df = pd.DataFrame({'score': [45, 70, 90, 100, 60]})\n",
"\n",
"scaler = MinMaxScaler() # 归一化\n",
"df['score_norm'] = scaler.fit_transform(df[['score']])\n",
"print(df)\n"
]
},
{
"cell_type": "markdown",
"id": "5ea37fc1",
"metadata": {},
"source": [
"## 标准化"
]
},
{
"cell_type": "code",
"execution_count": 27,
"id": "aa48d78e",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" score score_std\n",
"0 45 -1.407053\n",
"1 70 -0.150756\n",
"2 90 0.854282\n",
"3 100 1.356801\n",
"4 60 -0.653275\n"
]
}
],
"source": [
"from sklearn.preprocessing import StandardScaler\n",
"\n",
"df = pd.DataFrame({'score': [45, 70, 90, 100, 60]})\n",
"\n",
"scaler = StandardScaler()\n",
"df['score_std'] = scaler.fit_transform(df[['score']])\n",
"print(df)\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "pytorch",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.19"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment