补全.py 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370
  1. #补全1--平均值填充
  2. from flask import Flask, request, jsonify
  3. import numpy as np
  4. import pandas as pd
  5. import os
  6. import requests
  7. app = Flask(__name__)
  @app.route('/ave_data', methods=['POST'])
  def process_data():
      """Fill missing values in a remote CSV using column means.

      Expects a JSON request body containing a ``file_url`` key. The CSV at
      that URL is downloaded and parsed without a header row. Missing values
      in numeric columns are replaced by the column mean; missing values in
      object (string) columns are replaced by the column mode. The result is
      written, without header or index, to a fixed path under ``/tmp``.

      Returns:
          A JSON response whose ``code`` field is 200 on success (with
          ``file_path`` set to the written file), 400 when no ``file_url``
          was supplied, or 500 when the download or the processing failed.
      """
      # Local import: ``pd.compat.StringIO`` no longer exists in pandas, the
      # standard-library class is the supported way to wrap text for read_csv.
      from io import StringIO

      try:
          # silent=True yields None instead of raising on a missing/invalid
          # JSON body, so malformed requests get the 400 answer below.
          payload = request.get_json(silent=True)
          if not isinstance(payload, dict) or not payload.get('file_url'):
              return jsonify({'code': 400, 'msg': 'No file URL provided'})
          file_url = payload['file_url']

          # NOTE(review): file_url comes straight from the client, so this is
          # a server-side fetch of an arbitrary URL; consider an allow-list.
          # The timeout keeps an unresponsive host from blocking the worker.
          response = requests.get(file_url, timeout=30)
          if response.status_code != 200:
              return jsonify({'code': 500, 'msg': 'Failed to retrieve file from URL'})

          # header=None labels the columns 0..n-1, so there are no named
          # columns (for example no 'Time' column) to special-case here.
          df = pd.read_csv(StringIO(response.text), header=None)

          # Numeric columns: fill with the mean. Assign the result back rather
          # than calling fillna(inplace=True) on a column selection, which
          # does not reliably update the parent frame.
          for col in df.select_dtypes(include=[np.number]).columns:
              df[col] = df[col].fillna(df[col].mean())

          # Object columns: fill with the most frequent value.
          for col in df.select_dtypes(include=['object']).columns:
              modes = df[col].mode()
              # A column with no observed values has no mode; leave it as is.
              if not modes.empty:
                  df[col] = df[col].fillna(modes.iloc[0])

          result_file_path = os.path.join('/tmp', '补全后的数据-平均值.csv')
          df.to_csv(result_file_path, index=False, header=False)

          # Only the path of the written file is returned, not its content.
          return jsonify({
              'code': 200,
              'msg': '文件处理完成',
              'file_path': result_file_path
          })
      except Exception as e:
          # Request boundary: report any failure to the client as a 500 payload.
          return jsonify({
              'code': 500,
              'msg': str(e)
          })
  if __name__ == '__main__':
      # The Werkzeug debugger lets a browser execute arbitrary code, and this
      # server listens on every interface (0.0.0.0), so debug mode stays off
      # unless it is explicitly requested through FLASK_DEBUG.
      debug_enabled = os.environ.get('FLASK_DEBUG', '').lower() in ('1', 'true', 'yes')
      app.run(debug=debug_enabled, port=8081, host='0.0.0.0')
  58. # #补全2--中位数填充
  59. from flask import Flask, request, jsonify
  60. import numpy as np
  61. import pandas as pd
  62. import os
  63. import requests
  64. app = Flask(__name__)
  @app.route('/upper_data', methods=['POST'])
  def process_data():
      """Fill missing values in a remote CSV using column medians.

      Expects a JSON request body containing a ``file_url`` key. The CSV at
      that URL is downloaded and parsed without a header row. Missing values
      in numeric columns are replaced by the column median; missing values in
      object (string) columns are replaced by the column mode. The result is
      written, without header or index, to a fixed path under ``/tmp``.

      Returns:
          A JSON response whose ``code`` field is 200 on success (with
          ``file_path`` set to the written file), 400 when no ``file_url``
          was supplied, or 500 when the download or the processing failed.
      """
      # Local import: ``pd.compat.StringIO`` no longer exists in pandas, the
      # standard-library class is the supported way to wrap text for read_csv.
      from io import StringIO

      try:
          # silent=True yields None instead of raising on a missing/invalid
          # JSON body, so malformed requests get the 400 answer below.
          payload = request.get_json(silent=True)
          if not isinstance(payload, dict) or not payload.get('file_url'):
              return jsonify({'code': 400, 'msg': 'No file URL provided'})
          file_url = payload['file_url']

          # NOTE(review): file_url comes straight from the client, so this is
          # a server-side fetch of an arbitrary URL; consider an allow-list.
          # The timeout keeps an unresponsive host from blocking the worker.
          response = requests.get(file_url, timeout=30)
          if response.status_code != 200:
              return jsonify({'code': 500, 'msg': 'Failed to retrieve file from URL'})

          # header=None labels the columns 0..n-1, so there are no named
          # columns (for example no 'Time' column) to special-case here.
          df = pd.read_csv(StringIO(response.text), header=None)

          # Numeric columns: fill with the median. Assign the result back
          # rather than calling fillna(inplace=True) on a column selection,
          # which does not reliably update the parent frame.
          for col in df.select_dtypes(include=[np.number]).columns:
              df[col] = df[col].fillna(df[col].median())

          # Object columns: fill with the most frequent value.
          for col in df.select_dtypes(include=['object']).columns:
              modes = df[col].mode()
              # A column with no observed values has no mode; leave it as is.
              if not modes.empty:
                  df[col] = df[col].fillna(modes.iloc[0])

          result_file_path = os.path.join('/tmp', '补全后的数据-中位数.csv')
          df.to_csv(result_file_path, index=False, header=False)

          # Only the path of the written file is returned, not its content.
          return jsonify({
              'code': 200,
              'msg': '文件处理完成',
              'file_path': result_file_path
          })
      except Exception as e:
          # Request boundary: report any failure to the client as a 500 payload.
          return jsonify({
              'code': 500,
              'msg': str(e)
          })
  if __name__ == '__main__':
      # The Werkzeug debugger lets a browser execute arbitrary code, and this
      # server listens on every interface (0.0.0.0), so debug mode stays off
      # unless it is explicitly requested through FLASK_DEBUG.
      debug_enabled = os.environ.get('FLASK_DEBUG', '').lower() in ('1', 'true', 'yes')
      app.run(debug=debug_enabled, port=8081, host='0.0.0.0')
  113. # 补全3--众数填充
  114. from flask import Flask, request, jsonify
  115. import numpy as np
  116. import pandas as pd
  117. import os
  118. import requests
  119. app = Flask(__name__)
  @app.route('/mode_data', methods=['POST'])
  def process_data():
      """Fill missing values in a remote CSV using column modes.

      Expects a JSON request body containing a ``file_url`` key. The CSV at
      that URL is downloaded and parsed without a header row. Missing values
      in both numeric and object (string) columns are replaced by the most
      frequent value of the column. The result is written, without header or
      index, to a fixed path under ``/tmp``.

      Returns:
          A JSON response whose ``code`` field is 200 on success (with
          ``file_path`` set to the written file), 400 when no ``file_url``
          was supplied, or 500 when the download or the processing failed.
      """
      # Local import: ``pd.compat.StringIO`` no longer exists in pandas, the
      # standard-library class is the supported way to wrap text for read_csv.
      from io import StringIO

      try:
          # silent=True yields None instead of raising on a missing/invalid
          # JSON body, so malformed requests get the 400 answer below.
          payload = request.get_json(silent=True)
          if not isinstance(payload, dict) or not payload.get('file_url'):
              return jsonify({'code': 400, 'msg': 'No file URL provided'})
          file_url = payload['file_url']

          # NOTE(review): file_url comes straight from the client, so this is
          # a server-side fetch of an arbitrary URL; consider an allow-list.
          # The timeout keeps an unresponsive host from blocking the worker.
          response = requests.get(file_url, timeout=30)
          if response.status_code != 200:
              return jsonify({'code': 500, 'msg': 'Failed to retrieve file from URL'})

          # header=None labels the columns 0..n-1, so there are no named
          # columns (for example no 'Time' column) to special-case here.
          df = pd.read_csv(StringIO(response.text), header=None)

          # Numeric columns first, then object columns, each filled with the
          # column's most frequent value. Results are assigned back rather
          # than using fillna(inplace=True) on a column selection, which does
          # not reliably update the parent frame.
          numerical_cols = df.select_dtypes(include=[np.number]).columns
          categorical_cols = df.select_dtypes(include=['object']).columns
          for col in list(numerical_cols) + list(categorical_cols):
              modes = df[col].mode()
              # A column with no observed values has no mode; leave it as is.
              if not modes.empty:
                  df[col] = df[col].fillna(modes.iloc[0])

          result_file_path = os.path.join('/tmp', '补全后的数据-众数.csv')
          df.to_csv(result_file_path, index=False, header=False)

          # Only the path of the written file is returned, not its content.
          return jsonify({
              'code': 200,
              'msg': '文件处理完成',
              'file_path': result_file_path
          })
      except Exception as e:
          # Request boundary: report any failure to the client as a 500 payload.
          return jsonify({
              'code': 500,
              'msg': str(e)
          })
  if __name__ == '__main__':
      # The Werkzeug debugger lets a browser execute arbitrary code, and this
      # server listens on every interface (0.0.0.0), so debug mode stays off
      # unless it is explicitly requested through FLASK_DEBUG.
      debug_enabled = os.environ.get('FLASK_DEBUG', '').lower() in ('1', 'true', 'yes')
      app.run(debug=debug_enabled, port=8081, host='0.0.0.0')
  168. # #补全4--常数填充
  169. from flask import Flask, request, jsonify
  170. import pandas as pd
  171. import numpy as np
  172. import os
  173. import requests
  174. app = Flask(__name__)
  @app.route('/constant_data', methods=['POST'])
  def process_data():
      """Fill missing values in a remote CSV with fixed constants.

      Expects a JSON request body containing a ``file_url`` key. The CSV at
      that URL is downloaded and parsed without a header row. Missing values
      in numeric columns are replaced by ``0`` and missing values in object
      (string) columns by the string ``'0'``. The result is written, without
      header or index, to a fixed path under ``/tmp``.

      Returns:
          A JSON response whose ``code`` field is 200 on success (with
          ``file_path`` set to the written file), 400 when no ``file_url``
          was supplied, or 500 when the download or the processing failed.
      """
      # Local import: ``pd.compat.StringIO`` no longer exists in pandas, the
      # standard-library class is the supported way to wrap text for read_csv.
      from io import StringIO

      try:
          # silent=True yields None instead of raising on a missing/invalid
          # JSON body, so malformed requests get the 400 answer below.
          payload = request.get_json(silent=True)
          if not isinstance(payload, dict) or not payload.get('file_url'):
              return jsonify({'code': 400, 'msg': 'No file URL provided'})
          file_url = payload['file_url']

          # NOTE(review): file_url comes straight from the client, so this is
          # a server-side fetch of an arbitrary URL; consider an allow-list.
          # The timeout keeps an unresponsive host from blocking the worker.
          response = requests.get(file_url, timeout=30)
          if response.status_code != 200:
              return jsonify({'code': 500, 'msg': 'Failed to retrieve file from URL'})

          # header=None labels the columns 0..n-1, so there are no named
          # columns (for example no 'Time' column) to special-case here.
          df = pd.read_csv(StringIO(response.text), header=None)

          numerical_cols = df.select_dtypes(include=[np.number]).columns
          categorical_cols = df.select_dtypes(include=['object']).columns

          # Fill values: numeric 0 for numeric columns, the string '0' for
          # object columns so those columns keep holding strings.
          constant_value = 0
          constant_value_categorical = '0'

          # Assign the result back rather than calling fillna(inplace=True)
          # on a column selection, which does not reliably update the frame.
          for col in numerical_cols:
              df[col] = df[col].fillna(constant_value)
          for col in categorical_cols:
              df[col] = df[col].fillna(constant_value_categorical)

          result_file_path = os.path.join('/tmp', '补全后的数据-常数.csv')
          df.to_csv(result_file_path, index=False, header=False)

          # Only the path of the written file is returned, not its content.
          return jsonify({
              'code': 200,
              'msg': '文件处理完成',
              'file_path': result_file_path
          })
      except Exception as e:
          # Request boundary: report any failure to the client as a 500 payload.
          return jsonify({
              'code': 500,
              'msg': str(e)
          })
  if __name__ == '__main__':
      # The Werkzeug debugger lets a browser execute arbitrary code, and this
      # server listens on every interface (0.0.0.0), so debug mode stays off
      # unless it is explicitly requested through FLASK_DEBUG.
      debug_enabled = os.environ.get('FLASK_DEBUG', '').lower() in ('1', 'true', 'yes')
      app.run(debug=debug_enabled, port=8081, host='0.0.0.0')
  225. # #补全5--K-最近邻填充
  226. from flask import Flask, request, jsonify
  227. import pandas as pd
  228. import numpy as np
  229. from sklearn.impute import KNNImputer
  230. import os
  231. import requests
  232. app = Flask(__name__)
  @app.route('/K_data', methods=['POST'])
  def process_data():
      """Fill missing values in a remote CSV using K-nearest-neighbours.

      Expects a JSON request body containing a ``file_url`` key. The CSV at
      that URL is downloaded and parsed without a header row. Missing values
      in numeric columns are imputed with ``KNNImputer(n_neighbors=2)``;
      missing values in object (string) columns are replaced by the column
      mode. The result is written, without header or index, to a fixed path
      under ``/tmp``.

      Returns:
          A JSON response whose ``code`` field is 200 on success (with
          ``file_path`` set to the written file), 400 when no ``file_url``
          was supplied, or 500 when the download or the processing failed.
      """
      # Local import: ``pd.compat.StringIO`` no longer exists in pandas, the
      # standard-library class is the supported way to wrap text for read_csv.
      from io import StringIO

      try:
          # silent=True yields None instead of raising on a missing/invalid
          # JSON body, so malformed requests get the 400 answer below.
          payload = request.get_json(silent=True)
          if not isinstance(payload, dict) or not payload.get('file_url'):
              return jsonify({'code': 400, 'msg': 'No file URL provided'})
          file_url = payload['file_url']

          # NOTE(review): file_url comes straight from the client, so this is
          # a server-side fetch of an arbitrary URL; consider an allow-list.
          # The timeout keeps an unresponsive host from blocking the worker.
          response = requests.get(file_url, timeout=30)
          if response.status_code != 200:
              return jsonify({'code': 500, 'msg': 'Failed to retrieve file from URL'})

          # header=None labels the columns 0..n-1, so there are no named
          # columns (for example no 'Time' column) to special-case here.
          df = pd.read_csv(StringIO(response.text), header=None)

          # Only impute numeric columns that have at least one observed value:
          # by default KNNImputer drops all-missing columns from its output,
          # which would break the assignment back into the frame, and it
          # rejects an input with no columns at all.
          numeric_cols = [
              col for col in df.select_dtypes(include=[np.number]).columns
              if df[col].notna().any()
          ]
          if numeric_cols:
              imputer = KNNImputer(n_neighbors=2)
              df[numeric_cols] = imputer.fit_transform(df[numeric_cols])

          # Object columns: fill with the most frequent value. Assign the
          # result back rather than using fillna(inplace=True) on a column
          # selection, which does not reliably update the parent frame.
          for col in df.select_dtypes(include=['object']).columns:
              modes = df[col].mode()
              # A column with no observed values has no mode; leave it as is.
              if not modes.empty:
                  df[col] = df[col].fillna(modes.iloc[0])

          result_file_path = os.path.join('/tmp', '补全后的数据-最近邻.csv')
          df.to_csv(result_file_path, index=False, header=False)

          # Only the path of the written file is returned, not its content.
          return jsonify({
              'code': 200,
              'msg': '文件处理完成',
              'file_path': result_file_path
          })
      except Exception as e:
          # Request boundary: report any failure to the client as a 500 payload.
          return jsonify({
              'code': 500,
              'msg': str(e)
          })
  if __name__ == '__main__':
      # The Werkzeug debugger lets a browser execute arbitrary code, and this
      # server listens on every interface (0.0.0.0), so debug mode stays off
      # unless it is explicitly requested through FLASK_DEBUG.
      debug_enabled = os.environ.get('FLASK_DEBUG', '').lower() in ('1', 'true', 'yes')
      app.run(debug=debug_enabled, port=8081, host='0.0.0.0')