123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207 |
- # import matplotlib.pyplot as plt
- # from matplotlib import rcParams
- # plt.rcParams['xtick.direction'] = 'in'
- # plt.rcParams['ytick.direction'] = 'in'
- # plt.rcParams['font.sans-serif'] = ['SimHei'] # 用来正常显示中文标签
- # plt.rcParams['axes.unicode_minus'] = False # 用来正常显示负号
- # config = {
- # "font.family": 'serif',
- # "font.size": 20,
- # "mathtext.fontset": 'stix',
- # "font.serif": ['Times New Roman'],#宋体
- # 'axes.unicode_minus': False # 处理负号
- # }
- # rcParams.update(config)
# Imputation variant 1: fill numeric columns with their mean and
# categorical (object) columns with their most frequent value, then save
# the completed table as a header-less CSV.
import numpy as np
import pandas as pd

# Read the data set. header=None tells pandas not to take column names from
# the first row, so the columns are labelled with integers.
# df = pd.read_csv(r'E:\BaiduSyncdisk\zhiguan\01最近做的\算法调试\自己调试--Y\调试\数据缺失-无time.csv', header=None)
df = pd.read_csv(r'D:\验收材料\空工大-装备性能退化评估和故障预测健康管理软件\里程碑最终算法\数据缺失-time.csv', header=None)

# Optional time-column handling.
# NOTE(review): with header=None the column labels are integers, so a column
# labelled 'Time' should never be present and this branch looks unreachable
# for the file read above -- confirm whether the header row was meant to be
# parsed.
if 'Time' in df.columns:
    # Parse the column as datetimes; values that cannot be parsed become NaT.
    df['Time'] = pd.to_datetime(df['Time'], errors='coerce')
    # Sort the rows chronologically.
    df.sort_values('Time', inplace=True)
    # Interpolate the missing timestamps.
    # NOTE(review): method='time' presumably requires a DatetimeIndex, which
    # this frame does not have -- verify before relying on this branch.
    df['Time'] = df['Time'].interpolate(method='time')
    # Restore the original row order afterwards if that is required:
    # df.sort_index(inplace=True)

# Numeric columns: replace missing values with the column mean.
numerical_cols = df.select_dtypes(include=[np.number]).columns
for col in numerical_cols:
    if df[col].isna().any():
        # Assign the result back instead of calling fillna(inplace=True) on
        # the df[col] selection: the chained in-place form operates on an
        # intermediate object and is not guaranteed to update df.
        df[col] = df[col].fillna(df[col].mean())

# Categorical (object) columns: replace missing values with the mode.
categorical_cols = df.select_dtypes(include=['object']).columns
for col in categorical_cols:
    if not df[col].isna().any():
        continue
    modes = df[col].mode()
    # mode() is empty when the column has no non-missing value; there is
    # nothing to fill with in that case, so the column is left untouched.
    if not modes.empty:
        df[col] = df[col].fillna(modes.iloc[0])

# Save the completed data without the index and without a header row.
df.to_csv('补全后的数据.csv', index=False, header=False)
- # # # #补全2--中位数填充
- # # import numpy as np
- # # import pandas as pd
- # #
- # # # 读取数据集
- # # # df = pd.read_csv(r'E:\BaiduSyncdisk\zhiguan\01最近做的\算法调试\自己调试--Y\调试\数据缺失-无time.csv', header=None)
- # # df = pd.read_csv(r'E:\BaiduSyncdisk\zhiguan\01最近做的\算法调试\自己调试--Y\调试\数据缺失-time.csv', header=None)
- # #
- # # # 检查是否存在时间列
- # # if 'Time' in df.columns:
- # # # 提取时间列并转换为 datetime 类型
- # # df['Time'] = pd.to_datetime(df['Time'], errors='coerce')
- # #
- # # # 对时间列进行排序,确保时间顺序
- # # df.sort_values('Time', inplace=True)
- # #
- # # # 使用 interpolate 方法进行线性插值填充缺失的时间值
- # # df['Time'] = df['Time'].interpolate(method='time')
- # #
- # # # 分离数值型数据
- # # numerical_cols = df.select_dtypes(include=[np.number]).columns
- # #
- # # # 填充数值型数据的缺失值,使用中位数填充
- # # for col in numerical_cols:
- # # df[col].fillna(df[col].median(), inplace=True)
- # #
- # # # 分离分类型数据
- # # categorical_cols = df.select_dtypes(include=['object']).columns
- # #
- # # # Fill missing values in categorical columns with the mode (a median is not defined for non-numeric data)
- # # for col in categorical_cols:
- # # df[col].fillna(df[col].mode()[0], inplace=True)
- # #
- # # # 将结果保存为CSV文件
- # # df.to_csv('补全后的数据.csv', index=False, header=False)
- # ## 补全3--众数填充
- # # import numpy as np
- # # import pandas as pd
- # #
- # # # 读取数据集
- # # df = pd.read_csv(r'E:\BaiduSyncdisk\zhiguan\01最近做的\算法调试\自己调试--Y\调试\数据缺失-无time.csv', header=None)
- # # # df = pd.read_csv(r'E:\BaiduSyncdisk\zhiguan\01最近做的\算法调试\自己调试--Y\调试\数据缺失-time.csv', header=None)
- # #
- # # # 检查是否存在时间列
- # # if 'Time' in df.columns:
- # # # 提取时间列并转换为 datetime 类型
- # # df['Time'] = pd.to_datetime(df['Time'], errors='coerce')
- # #
- # # # 对时间列进行排序,确保时间顺序
- # # df.sort_values('Time', inplace=True)
- # #
- # # # 使用 interpolate 方法进行线性插值填充缺失的时间值
- # # df['Time'] = df['Time'].interpolate(method='time')
- # #
- # # # 分离数值型数据
- # # numerical_cols = df.select_dtypes(include=[np.number]).columns
- # #
- # # # 填充数值型数据的缺失值,使用众数填充
- # # for col in numerical_cols:
- # # df[col].fillna(df[col].mode()[0], inplace=True)
- # #
- # # # 分离分类型数据
- # # categorical_cols = df.select_dtypes(include=['object']).columns
- # #
- # # # 填充分类型数据的缺失值,使用众数填充
- # # for col in categorical_cols:
- # # df[col].fillna(df[col].mode()[0], inplace=True)
- # #
- # # # 将结果保存为CSV文件
- # # df.to_csv('补全后的数据.csv', index=False, header=False)
- # # # #补全4--常数填充
- # # import numpy as np
- # # import pandas as pd
- # #
- # # # 读取数据集
- # # df = pd.read_csv(r'E:\BaiduSyncdisk\zhiguan\01最近做的\算法调试\自己调试--Y\调试\数据缺失-无time.csv', header=None)
- # # # df = pd.read_csv(r'E:\BaiduSyncdisk\zhiguan\01最近做的\算法调试\自己调试--Y\调试\数据缺失-time.csv', header=None)
- # #
- # # # 检查是否存在时间列
- # # if 'Time' in df.columns:
- # # # 提取时间列并转换为 datetime 类型
- # # df['Time'] = pd.to_datetime(df['Time'], errors='coerce')
- # #
- # # # 对时间列进行排序,确保时间顺序
- # # df.sort_values('Time', inplace=True)
- # #
- # # # 使用 interpolate 方法进行线性插值填充缺失的时间值
- # # df['Time'] = df['Time'].interpolate(method='time')
- # #
- # # # 分离数值型数据和分类型数据
- # # numerical_cols = df.select_dtypes(include=[np.number]).columns
- # # categorical_cols = df.select_dtypes(include=['object']).columns
- # #
- # # # 定义常数填充值,这里以0为例
- # # constant_value = 0
- # #
- # # # 填充数值型数据的缺失值,使用常数填充
- # # for col in numerical_cols:
- # # df[col].fillna(constant_value, inplace=True)
- # #
- # # # 填充分类型数据的缺失值,使用常数0填充
- # # constant_value_categorical = '0' # 或者其他认为合适的值
- # # for col in categorical_cols:
- # # df[col].fillna(constant_value_categorical, inplace=True)
- # #
- # # # 将结果保存为CSV文件
- # # df.to_csv('补全后的数据.csv', index=False, header=False)
- # # #补全5--K-最近邻填充
- # import numpy as np
- # import pandas as pd
- # from sklearn.impute import KNNImputer
- # # 读取数据集
- # df = pd.read_csv(r'E:\BaiduSyncdisk\zhiguan\01最近做的\算法调试\自己调试--Y\调试\数据缺失-无time.csv')
- # # df = pd.read_csv(r'E:\BaiduSyncdisk\zhiguan\01最近做的\算法调试\自己调试--Y\调试\数据缺失-time.csv')
- # # 检查是否存在时间列并处理
- # if 'Time' in df.columns:
- # df['Time'] = pd.to_datetime(df['Time'], errors='coerce')
- # df.sort_values('Time', inplace=True)
- # df['Time'] = df['Time'].interpolate(method='time')
- # # 分离数值型数据
- # numerical_cols = df.select_dtypes(include=[np.number]).columns
- # # 使用 KNN 填充数值型数据的缺失值
- # imputer = KNNImputer(n_neighbors=2)
- # df[numerical_cols] = imputer.fit_transform(df[numerical_cols])
- # # 分离分类型数据
- # categorical_cols = df.select_dtypes(include=['object']).columns
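- # # Fill missing values in categorical columns with the mode (the KNN imputer above is applied to the numeric columns only)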
- # for col in categorical_cols:
- # df[col].fillna(df[col].mode()[0], inplace=True)
- # # 将结果保存为CSV文件
- # df.to_csv('补全后的数据.csv', index=False, header=False)
|