123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370 |
- #补全1--平均值填充
- from flask import Flask, request, jsonify
- import numpy as np
- import pandas as pd
- import os
- import requests
- app = Flask(__name__)
@app.route('/ave_data', methods=['POST'])
def process_data():
    """Impute missing values in a remote CSV using per-column means.

    The POST body must be a JSON object containing ``file_url``. The CSV at
    that URL is downloaded and parsed without a header row. Missing values
    in numeric columns are replaced by the column mean; missing values in
    object (string) columns are replaced by the column's most frequent
    value. The result is written, without header or index, to a fixed path
    under ``/tmp``.

    Returns:
        A JSON response whose ``code`` field is 200 (with ``file_path``) on
        success, 400 when ``file_url`` is missing, and 500 when the download
        fails or any other error occurs (``msg`` then holds the error text).
        No explicit HTTP status is set on the response.
    """
    # io.StringIO replaces pd.compat.StringIO, which current pandas releases
    # no longer provide; imported locally so the block is self-contained.
    from io import StringIO

    try:
        # silent=True returns None for a missing or malformed JSON body
        # instead of raising, so such requests get the 400 reply below.
        payload = request.get_json(silent=True)
        if not isinstance(payload, dict) or 'file_url' not in payload:
            return jsonify({'code': 400, 'msg': 'No file URL provided'})

        # NOTE(review): file_url is caller-supplied and fetched as-is;
        # restrict allowed hosts if untrusted clients can reach this route.
        # The timeout keeps an unresponsive host from blocking the worker.
        response = requests.get(payload['file_url'], timeout=30)
        if response.status_code != 200:
            return jsonify({'code': 500, 'msg': 'Failed to retrieve file from URL'})

        # header=None: every row is data and columns get integer labels.
        # NOTE: with integer labels a column named 'Time' can never exist,
        # so no time-column handling is performed here.
        df = pd.read_csv(StringIO(response.text), header=None)

        # Assign the filled column back instead of calling fillna(inplace=True)
        # on df[col]: the chained in-place form acts on a temporary object
        # and does not reliably update df.
        numeric_cols = df.select_dtypes(include=[np.number]).columns
        for col in numeric_cols:
            df[col] = df[col].fillna(df[col].mean())

        categorical_cols = df.select_dtypes(include=['object']).columns
        for col in categorical_cols:
            modes = df[col].mode()
            # mode() is empty when the column has no non-missing values;
            # leave such a column untouched rather than failing the request.
            if not modes.empty:
                df[col] = df[col].fillna(modes.iloc[0])

        # NOTE: the output path is fixed, so concurrent requests overwrite
        # each other's result file.
        result_file_path = os.path.join('/tmp', '补全后的数据-平均值.csv')
        df.to_csv(result_file_path, index=False, header=False)

        # Only the path of the written file is returned, not its contents.
        return jsonify({
            'code': 200,
            'msg': '文件处理完成',
            'file_path': result_file_path
        })
    except Exception as e:
        # Top-level boundary: report any failure to the client as code 500.
        return jsonify({
            'code': 500,
            'msg': str(e)
        })
if __name__ == '__main__':
    # The Werkzeug interactive debugger permits arbitrary code execution, so
    # it must not be enabled by default on a server bound to all interfaces.
    # Set FLASK_DEBUG=1 in the environment to opt in during development.
    app.run(debug=os.environ.get('FLASK_DEBUG') == '1', port=8081, host='0.0.0.0')
- # #补全2--中位数填充
- from flask import Flask, request, jsonify
- import numpy as np
- import pandas as pd
- import os
- import requests
- app = Flask(__name__)
@app.route('/upper_data', methods=['POST'])
def process_data():
    """Impute missing values in a remote CSV using per-column medians.

    The POST body must be a JSON object containing ``file_url``. The CSV at
    that URL is downloaded and parsed without a header row. Missing values
    in numeric columns are replaced by the column median; missing values in
    object (string) columns are replaced by the column's most frequent
    value. The result is written, without header or index, to a fixed path
    under ``/tmp``.

    Returns:
        A JSON response whose ``code`` field is 200 (with ``file_path``) on
        success, 400 when ``file_url`` is missing, and 500 when the download
        fails or any other error occurs (``msg`` then holds the error text).
        No explicit HTTP status is set on the response.
    """
    # io.StringIO replaces pd.compat.StringIO, which current pandas releases
    # no longer provide; imported locally so the block is self-contained.
    from io import StringIO

    try:
        # silent=True returns None for a missing or malformed JSON body
        # instead of raising, so such requests get the 400 reply below.
        payload = request.get_json(silent=True)
        if not isinstance(payload, dict) or 'file_url' not in payload:
            return jsonify({'code': 400, 'msg': 'No file URL provided'})

        # NOTE(review): file_url is caller-supplied and fetched as-is;
        # restrict allowed hosts if untrusted clients can reach this route.
        # The timeout keeps an unresponsive host from blocking the worker.
        response = requests.get(payload['file_url'], timeout=30)
        if response.status_code != 200:
            return jsonify({'code': 500, 'msg': 'Failed to retrieve file from URL'})

        # header=None: every row is data and columns get integer labels.
        # NOTE: with integer labels a column named 'Time' can never exist,
        # so no time-column handling is performed here.
        df = pd.read_csv(StringIO(response.text), header=None)

        # Assign the filled column back instead of calling fillna(inplace=True)
        # on df[col]: the chained in-place form acts on a temporary object
        # and does not reliably update df.
        numeric_cols = df.select_dtypes(include=[np.number]).columns
        for col in numeric_cols:
            df[col] = df[col].fillna(df[col].median())

        categorical_cols = df.select_dtypes(include=['object']).columns
        for col in categorical_cols:
            modes = df[col].mode()
            # mode() is empty when the column has no non-missing values;
            # leave such a column untouched rather than failing the request.
            if not modes.empty:
                df[col] = df[col].fillna(modes.iloc[0])

        # NOTE: the output path is fixed, so concurrent requests overwrite
        # each other's result file.
        result_file_path = os.path.join('/tmp', '补全后的数据-中位数.csv')
        df.to_csv(result_file_path, index=False, header=False)

        # Only the path of the written file is returned, not its contents.
        return jsonify({
            'code': 200,
            'msg': '文件处理完成',
            'file_path': result_file_path
        })
    except Exception as e:
        # Top-level boundary: report any failure to the client as code 500.
        return jsonify({
            'code': 500,
            'msg': str(e)
        })
if __name__ == '__main__':
    # The Werkzeug interactive debugger permits arbitrary code execution, so
    # it must not be enabled by default on a server bound to all interfaces.
    # Set FLASK_DEBUG=1 in the environment to opt in during development.
    app.run(debug=os.environ.get('FLASK_DEBUG') == '1', port=8081, host='0.0.0.0')
- # 补全3--众数填充
- from flask import Flask, request, jsonify
- import numpy as np
- import pandas as pd
- import os
- import requests
- app = Flask(__name__)
@app.route('/mode_data', methods=['POST'])
def process_data():
    """Impute missing values in a remote CSV using per-column modes.

    The POST body must be a JSON object containing ``file_url``. The CSV at
    that URL is downloaded and parsed without a header row. Missing values
    in both numeric and object (string) columns are replaced by the column's
    most frequent value. The result is written, without header or index, to
    a fixed path under ``/tmp``.

    Returns:
        A JSON response whose ``code`` field is 200 (with ``file_path``) on
        success, 400 when ``file_url`` is missing, and 500 when the download
        fails or any other error occurs (``msg`` then holds the error text).
        No explicit HTTP status is set on the response.
    """
    # io.StringIO replaces pd.compat.StringIO, which current pandas releases
    # no longer provide; imported locally so the block is self-contained.
    from io import StringIO

    try:
        # silent=True returns None for a missing or malformed JSON body
        # instead of raising, so such requests get the 400 reply below.
        payload = request.get_json(silent=True)
        if not isinstance(payload, dict) or 'file_url' not in payload:
            return jsonify({'code': 400, 'msg': 'No file URL provided'})

        # NOTE(review): file_url is caller-supplied and fetched as-is;
        # restrict allowed hosts if untrusted clients can reach this route.
        # The timeout keeps an unresponsive host from blocking the worker.
        response = requests.get(payload['file_url'], timeout=30)
        if response.status_code != 200:
            return jsonify({'code': 500, 'msg': 'Failed to retrieve file from URL'})

        # header=None: every row is data and columns get integer labels.
        # NOTE: with integer labels a column named 'Time' can never exist,
        # so no time-column handling is performed here.
        df = pd.read_csv(StringIO(response.text), header=None)

        # Numeric and object columns receive the same treatment here, so
        # they are processed in one pass (numeric first, as before).
        numeric_cols = df.select_dtypes(include=[np.number]).columns
        categorical_cols = df.select_dtypes(include=['object']).columns
        for col in list(numeric_cols) + list(categorical_cols):
            modes = df[col].mode()
            # mode() is empty when the column has no non-missing values;
            # leave such a column untouched rather than failing the request.
            if modes.empty:
                continue
            # Assign back instead of fillna(inplace=True) on df[col]: the
            # chained in-place form does not reliably update df.
            df[col] = df[col].fillna(modes.iloc[0])

        # NOTE: the output path is fixed, so concurrent requests overwrite
        # each other's result file.
        result_file_path = os.path.join('/tmp', '补全后的数据-众数.csv')
        df.to_csv(result_file_path, index=False, header=False)

        # Only the path of the written file is returned, not its contents.
        return jsonify({
            'code': 200,
            'msg': '文件处理完成',
            'file_path': result_file_path
        })
    except Exception as e:
        # Top-level boundary: report any failure to the client as code 500.
        return jsonify({
            'code': 500,
            'msg': str(e)
        })
if __name__ == '__main__':
    # The Werkzeug interactive debugger permits arbitrary code execution, so
    # it must not be enabled by default on a server bound to all interfaces.
    # Set FLASK_DEBUG=1 in the environment to opt in during development.
    app.run(debug=os.environ.get('FLASK_DEBUG') == '1', port=8081, host='0.0.0.0')
- # #补全4--常数填充
- from flask import Flask, request, jsonify
- import pandas as pd
- import numpy as np
- import os
- import requests
- app = Flask(__name__)
@app.route('/constant_data', methods=['POST'])
def process_data():
    """Impute missing values in a remote CSV using fixed constants.

    The POST body must be a JSON object containing ``file_url``. The CSV at
    that URL is downloaded and parsed without a header row. Missing values
    in numeric columns are replaced by ``0`` and missing values in object
    (string) columns by the string ``'0'``. The result is written, without
    header or index, to a fixed path under ``/tmp``.

    Returns:
        A JSON response whose ``code`` field is 200 (with ``file_path``) on
        success, 400 when ``file_url`` is missing, and 500 when the download
        fails or any other error occurs (``msg`` then holds the error text).
        No explicit HTTP status is set on the response.
    """
    # io.StringIO replaces pd.compat.StringIO, which current pandas releases
    # no longer provide; imported locally so the block is self-contained.
    from io import StringIO

    try:
        # silent=True returns None for a missing or malformed JSON body
        # instead of raising, so such requests get the 400 reply below.
        payload = request.get_json(silent=True)
        if not isinstance(payload, dict) or 'file_url' not in payload:
            return jsonify({'code': 400, 'msg': 'No file URL provided'})

        # NOTE(review): file_url is caller-supplied and fetched as-is;
        # restrict allowed hosts if untrusted clients can reach this route.
        # The timeout keeps an unresponsive host from blocking the worker.
        response = requests.get(payload['file_url'], timeout=30)
        if response.status_code != 200:
            return jsonify({'code': 500, 'msg': 'Failed to retrieve file from URL'})

        # header=None: every row is data and columns get integer labels.
        # NOTE: with integer labels a column named 'Time' can never exist,
        # so no time-column handling is performed here.
        df = pd.read_csv(StringIO(response.text), header=None)

        numeric_cols = df.select_dtypes(include=[np.number]).columns
        categorical_cols = df.select_dtypes(include=['object']).columns

        # Fill values: numeric 0 for numeric columns, the string '0' for
        # object columns so their values stay strings.
        constant_value = 0
        constant_value_categorical = '0'

        # Assign the filled column back instead of calling fillna(inplace=True)
        # on df[col]: the chained in-place form acts on a temporary object
        # and does not reliably update df.
        for col in numeric_cols:
            df[col] = df[col].fillna(constant_value)
        for col in categorical_cols:
            df[col] = df[col].fillna(constant_value_categorical)

        # NOTE: the output path is fixed, so concurrent requests overwrite
        # each other's result file.
        result_file_path = os.path.join('/tmp', '补全后的数据-常数.csv')
        df.to_csv(result_file_path, index=False, header=False)

        # Only the path of the written file is returned, not its contents.
        return jsonify({
            'code': 200,
            'msg': '文件处理完成',
            'file_path': result_file_path
        })
    except Exception as e:
        # Top-level boundary: report any failure to the client as code 500.
        return jsonify({
            'code': 500,
            'msg': str(e)
        })
if __name__ == '__main__':
    # The Werkzeug interactive debugger permits arbitrary code execution, so
    # it must not be enabled by default on a server bound to all interfaces.
    # Set FLASK_DEBUG=1 in the environment to opt in during development.
    app.run(debug=os.environ.get('FLASK_DEBUG') == '1', port=8081, host='0.0.0.0')
- # #补全5--K-最近邻填充
- from flask import Flask, request, jsonify
- import pandas as pd
- import numpy as np
- from sklearn.impute import KNNImputer
- import os
- import requests
- app = Flask(__name__)
@app.route('/K_data', methods=['POST'])
def process_data():
    """Impute missing values in a remote CSV using K-nearest neighbours.

    The POST body must be a JSON object containing ``file_url``. The CSV at
    that URL is downloaded and parsed without a header row. Missing values
    in numeric columns are filled by ``KNNImputer(n_neighbors=2)``; missing
    values in object (string) columns are replaced by the column's most
    frequent value. The result is written, without header or index, to a
    fixed path under ``/tmp``.

    Returns:
        A JSON response whose ``code`` field is 200 (with ``file_path``) on
        success, 400 when ``file_url`` is missing, and 500 when the download
        fails or any other error occurs (``msg`` then holds the error text).
        No explicit HTTP status is set on the response.
    """
    # io.StringIO replaces pd.compat.StringIO, which current pandas releases
    # no longer provide; imported locally so the block is self-contained.
    from io import StringIO

    try:
        # silent=True returns None for a missing or malformed JSON body
        # instead of raising, so such requests get the 400 reply below.
        payload = request.get_json(silent=True)
        if not isinstance(payload, dict) or 'file_url' not in payload:
            return jsonify({'code': 400, 'msg': 'No file URL provided'})

        # NOTE(review): file_url is caller-supplied and fetched as-is;
        # restrict allowed hosts if untrusted clients can reach this route.
        # The timeout keeps an unresponsive host from blocking the worker.
        response = requests.get(payload['file_url'], timeout=30)
        if response.status_code != 200:
            return jsonify({'code': 500, 'msg': 'Failed to retrieve file from URL'})

        # header=None: every row is data and columns get integer labels.
        # NOTE: with integer labels a column named 'Time' can never exist,
        # so no time-column handling is performed here.
        df = pd.read_csv(StringIO(response.text), header=None)

        numeric_cols = df.select_dtypes(include=[np.number]).columns
        # KNNImputer discards columns that are entirely missing, which would
        # make its output narrower than the selection being assigned to.
        # Impute only columns that hold at least one observed value.
        fillable_cols = [col for col in numeric_cols if df[col].notna().any()]
        # Skip the imputer entirely when nothing is fillable; fitting on a
        # frame with zero columns raises.
        if fillable_cols:
            imputer = KNNImputer(n_neighbors=2)
            df[fillable_cols] = imputer.fit_transform(df[fillable_cols])

        categorical_cols = df.select_dtypes(include=['object']).columns
        for col in categorical_cols:
            modes = df[col].mode()
            # mode() is empty when the column has no non-missing values;
            # leave such a column untouched rather than failing the request.
            if not modes.empty:
                # Assign back instead of fillna(inplace=True) on df[col]: the
                # chained in-place form does not reliably update df.
                df[col] = df[col].fillna(modes.iloc[0])

        # NOTE: the output path is fixed, so concurrent requests overwrite
        # each other's result file.
        result_file_path = os.path.join('/tmp', '补全后的数据-最近邻.csv')
        df.to_csv(result_file_path, index=False, header=False)

        # Only the path of the written file is returned, not its contents.
        return jsonify({
            'code': 200,
            'msg': '文件处理完成',
            'file_path': result_file_path
        })
    except Exception as e:
        # Top-level boundary: report any failure to the client as code 500.
        return jsonify({
            'code': 500,
            'msg': str(e)
        })
if __name__ == '__main__':
    # The Werkzeug interactive debugger permits arbitrary code execution, so
    # it must not be enabled by default on a server bound to all interfaces.
    # Set FLASK_DEBUG=1 in the environment to opt in during development.
    app.run(debug=os.environ.get('FLASK_DEBUG') == '1', port=8081, host='0.0.0.0')
|