"""Flask service exposing several tabular data-augmentation methods.

Each endpoint reads a CSV file, appends augmented rows to it and writes the
result to a caller-supplied path:

* ``/random``        - ordered sampling: re-append the trailing rows.
* ``/perturbation``  - add Gaussian noise to the numeric columns.
* ``/Wavelet``       - perturb wavelet coefficients, then reconstruct.
* ``/coefficient``   - perturb wavelet coefficients, then reconstruct.
* ``/interpolation`` - as above, with optional coefficient interpolation.
"""
import os

import numpy as np
import pandas as pd
import pywt
import requests
from flask import Flask, request, jsonify

app = Flask(__name__)

# Defaults shared by every endpoint.
DEFAULT_WAVELET = 'db1'          # wavelet basis
DEFAULT_LEVEL = 1                # decomposition depth
DEFAULT_SIGMA = 0.05             # standard deviation of the injected noise
DEFAULT_EXPANSION_RATIO = 0.2    # new rows as a fraction of the original rows


def _drop_time_column(df):
    """Return ``df`` without its 'Time' column, if it has one."""
    if 'Time' in df.columns:
        return df.drop(columns=['Time'])
    return df


def _expansion_count(df, expansion_ratio):
    """Return how many rows to append for ``df`` at ``expansion_ratio``."""
    return int(len(df) * expansion_ratio)


def extend_data_with_ordered_sampling(df, expansion_ratio=0.2):
    """Extend a data frame by re-appending its trailing rows in order.

    :param df: Source data frame. A 'Time' column, if present, is dropped.
    :param expansion_ratio: Number of appended rows as a fraction of the
        number of rows in ``df``.
    :return: A new data frame holding the original rows followed by the
        sampled rows, with a fresh integer index.
    """
    df = _drop_time_column(df)
    n_samples = _expansion_count(df, expansion_ratio)

    # Sort by index so that "the last rows" is well defined, then take them
    # without shuffling.
    sampled_data = df.sort_index().tail(n_samples).copy()

    return pd.concat([df, sampled_data], ignore_index=True)


def add_random_perturbation(series, sigma):
    """Return ``series`` plus zero-mean Gaussian noise of std ``sigma``."""
    return series + np.random.normal(0, sigma, size=len(series))


def extend_data_with_perturbation(df, sigma, expansion_ratio):
    """Extend a data frame with noisy copies of its trailing rows.

    :param df: Source data frame. A 'Time' column, if present, is dropped.
    :param sigma: Standard deviation of the Gaussian noise added to every
        numeric column.
    :param expansion_ratio: Number of appended rows as a fraction of the
        number of rows in ``df``.
    :return: The original rows followed by the perturbed trailing rows.
    """
    df = _drop_time_column(df)

    perturbed = df.copy()
    for col in df.select_dtypes(include=[np.number]).columns:
        perturbed[col] = add_random_perturbation(df[col], sigma)

    n_samples = _expansion_count(df, expansion_ratio)
    # ``tail`` rather than ``iloc[-n:]``: when n is 0, ``iloc[-0:]`` selects
    # every row and would silently duplicate the whole frame.
    expanded_data = perturbed.tail(n_samples).copy()

    return pd.concat([df, expanded_data], ignore_index=True)


def wavelet_transform(series, wavelet='db1', level=1):
    """Decompose a 1-D numeric sequence with ``pywt.wavedec``."""
    return pywt.wavedec(series, wavelet, level=level)


def wavelet_reconstruct(coeffs, wavelet='db1'):
    """Rebuild a signal from wavelet coefficients with ``pywt.waverec``."""
    return pywt.waverec(coeffs, wavelet)


def perturb_coeffs(coeffs, sigma):
    """Add Gaussian noise of scale ``sigma`` to wavelet coefficients.

    Every coefficient array with a numeric dtype is perturbed (the first
    array in the list included); arrays of any other dtype are passed
    through unchanged.

    :param coeffs: List of coefficient arrays.
    :param sigma: Scale applied to standard-normal noise.
    :return: New list of coefficient arrays.
    """
    perturbed_coeffs = []
    for coeff in coeffs:
        if np.issubdtype(coeff.dtype, np.number):
            coeff = coeff + sigma * np.random.randn(*coeff.shape)
        perturbed_coeffs.append(coeff)
    return perturbed_coeffs


def enhance_or_reduce(coeffs, factor):
    """Scale every coefficient array except the first by ``factor``.

    The first entry of the list is left untouched; the remaining entries
    are multiplied by ``factor`` (> 1 amplifies, < 1 attenuates).
    """
    return [
        coeff * factor if position > 0 else coeff
        for position, coeff in enumerate(coeffs)
    ]


def interpolate_coeffs(coeffs, new_length):
    """Linearly interpolate each coefficient array using ``np.interp``.

    Each array is evaluated at the positions ``0 .. new_length - 1`` against
    its own sample positions ``0 .. len(coeff) - 1``. A falsy ``new_length``
    (``None`` or 0) returns the arrays unchanged.

    NOTE(review): every array is given the same length, which looks
    incompatible with multi-level decompositions - confirm before using
    ``level > 1`` together with ``new_length``.
    """
    interpolated_coeffs = []
    for coeff in coeffs:
        if new_length:
            coeff = np.interp(
                np.arange(new_length), np.arange(len(coeff)), coeff
            )
        interpolated_coeffs.append(coeff)
    return interpolated_coeffs


def extend_data_with_wavelet(df, wavelet='db1', level=1, sigma=0.05,
                             expansion_ratio=0.2, new_length=None,
                             detail_factor=None):
    """Extend a data frame with wavelet-domain perturbed trailing rows.

    Each numeric column is decomposed, its coefficients are perturbed (and
    optionally scaled and/or interpolated), and the column is reconstructed.
    The trailing rows of that reconstructed frame are appended to the
    original rows.

    This single definition replaces three same-named definitions that
    shadowed one another; it keeps the signature of the last one and adds
    the optional ``detail_factor``.

    :param df: Source data frame. A 'Time' column, if present, is dropped.
    :param wavelet: Wavelet name passed to PyWavelets.
    :param level: Decomposition level.
    :param sigma: Noise scale applied to the coefficients.
    :param expansion_ratio: Number of appended rows as a fraction of the
        number of rows in ``df``.
    :param new_length: If not ``None``, coefficients are interpolated to
        this length before reconstruction.
    :param detail_factor: If not ``None``, every coefficient array except
        the first is multiplied by this factor after perturbation.
    :return: The original rows followed by the augmented trailing rows.
    :raises ValueError: If a reconstructed column is shorter than the
        input column (possible when ``new_length`` is small).
    """
    df = _drop_time_column(df)
    n_rows = len(df)

    augmented = df.copy()
    for col in df.select_dtypes(include=[np.number]).columns:
        coeffs = wavelet_transform(df[col], wavelet, level)
        coeffs = perturb_coeffs(coeffs, sigma)
        if detail_factor is not None:
            coeffs = enhance_or_reduce(coeffs, detail_factor)
        if new_length is not None:
            coeffs = interpolate_coeffs(coeffs, new_length)

        reconstructed = wavelet_reconstruct(coeffs, wavelet)
        if len(reconstructed) < n_rows:
            raise ValueError(
                'Reconstructed column {0!r} has {1} samples but {2} are '
                'required; increase new_length.'.format(
                    col, len(reconstructed), n_rows)
            )
        # The reconstruction can be longer than the input (e.g. one padding
        # sample for odd lengths), so trim it back to the column length.
        augmented[col] = reconstructed[:n_rows]

    n_samples = _expansion_count(df, expansion_ratio)
    # ``tail`` rather than ``iloc[-n:]``: ``iloc[-0:]`` would select all rows.
    expanded_data = augmented.tail(n_samples).copy()

    # Keep the untouched original rows and append the augmented ones.
    return pd.concat([df, expanded_data], ignore_index=True)


def _process_request(augment, **to_csv_options):
    """Run one augmentation request and build its JSON response.

    Reads ``file_url`` and ``result_file_path`` from the JSON request body,
    loads the CSV, applies ``augment`` to the resulting data frame and
    writes the outcome to ``result_file_path``.

    NOTE(review): both paths come straight from the client, so this reads
    and writes arbitrary locations - restrict them before exposing the
    service to untrusted callers.

    :param augment: Callable taking a data frame and returning the extended
        data frame.
    :param to_csv_options: Extra keyword arguments for ``DataFrame.to_csv``.
    :return: A Flask JSON response whose ``code`` field is 200, 400 or 500.
    """
    try:
        # ``silent=True`` yields None for a missing or malformed JSON body
        # instead of raising, so it can be reported as a client error.
        payload = request.get_json(silent=True)
        if not isinstance(payload, dict) or not payload.get('file_url'):
            return jsonify({'code': 400, 'msg': 'No file URL provided'})

        result_file_path = payload.get('result_file_path')
        if not result_file_path:
            # Without a path ``to_csv`` returns a string and writes nothing.
            return jsonify(
                {'code': 400, 'msg': 'No result file path provided'}
            )

        frame = pd.read_csv(str(payload['file_url']))
        result = augment(frame)
        result.to_csv(result_file_path, index=False, **to_csv_options)

        return jsonify({
            'code': 200,
            'msg': '文件处理完成',
        })
    except Exception as e:
        app.logger.exception('Data augmentation request failed')
        return jsonify({
            'code': 500,
            'msg': str(e)
        })


# Augmentation 1 - random sampling: enlarge the data set by re-using samples
# drawn from the existing data.
@app.route('/random', methods=['POST'])
def upload_file1():
    """Handle ``POST /random``: extend the CSV by ordered sampling."""
    # NOTE(review): this endpoint alone writes the CSV without a header
    # row - confirm that is intended.
    return _process_request(
        lambda frame: extend_data_with_ordered_sampling(
            frame, DEFAULT_EXPANSION_RATIO),
        header=False,
    )


# Augmentation 2 - data perturbation: create new points by adding small
# random noise to the original numeric data.
@app.route('/perturbation', methods=['POST'])
def upload_file2():
    """Handle ``POST /perturbation``: extend the CSV with noisy rows."""
    return _process_request(
        lambda frame: extend_data_with_perturbation(
            frame, DEFAULT_SIGMA, DEFAULT_EXPANSION_RATIO)
    )


# Augmentation 3 - wavelet transform: decompose the signal into sub-signals
# and process those to produce augmented data.
@app.route('/Wavelet', methods=['POST'])
def upload_file3():
    """Handle ``POST /Wavelet``: extend the CSV via the wavelet pipeline."""
    return _process_request(
        lambda frame: extend_data_with_wavelet(
            frame, DEFAULT_WAVELET, DEFAULT_LEVEL, DEFAULT_SIGMA,
            DEFAULT_EXPANSION_RATIO)
    )


# Augmentation 4 - wavelet coefficient perturbation: add Gaussian noise with
# standard deviation sigma to the wavelet coefficients.
@app.route('/coefficient', methods=['POST'])
def upload_file4():
    """Handle ``POST /coefficient``: extend the CSV via perturbed coefficients."""
    return _process_request(
        lambda frame: extend_data_with_wavelet(
            frame, DEFAULT_WAVELET, DEFAULT_LEVEL, DEFAULT_SIGMA,
            DEFAULT_EXPANSION_RATIO)
    )


# Augmentation 5 - wavelet linear interpolation.
@app.route('/interpolation', methods=['POST'])
def upload_file5():
    """Handle ``POST /interpolation``: wavelet pipeline with interpolation."""
    new_length = None  # interpolation is currently disabled
    return _process_request(
        lambda frame: extend_data_with_wavelet(
            frame, DEFAULT_WAVELET, DEFAULT_LEVEL, DEFAULT_SIGMA,
            DEFAULT_EXPANSION_RATIO, new_length)
    )


if __name__ == '__main__':
    # NOTE(review): debug=True together with host='0.0.0.0' exposes the
    # interactive debugger on every network interface - development only.
    app.run(debug=True, port=10001, host='0.0.0.0')