import json
import os
import re
from time import time

import pandas as pd
from py2neo import Graph, Node, Relationship, Subgraph


def excel_to_json(excel_file, json_file):
    """Convert an Excel sheet into a JSON file holding one object per row.

    Args:
        excel_file: Path of the Excel workbook to read.
        json_file: Path of the UTF-8 JSON file to write.
    """
    data_frame = pd.read_excel(excel_file)
    # force_ascii=False keeps non-ASCII text as-is instead of \u escapes.
    json_data = data_frame.to_json(orient='records', force_ascii=False)
    with open(json_file, 'w', encoding='utf-8') as f:
        f.write(json_data)
    print(f"转换完成,JSON文件已保存至:{json_file}")


# Example usage. NOTE: this conversion runs every time the module is loaded.
excel_file_path = r'C:\Users\Machenike\Desktop\xn_data.xlsx'  # input Excel file
json_file_path = r'D:\hiddz\KG_QA\T_Neo\data.json'  # output JSON file
excel_to_json(excel_file_path, json_file_path)


class FaultGraph:
    """Builds py2neo nodes and relationship queries from the fault JSON file."""

    # Keys of every per-record fault dictionary; each defaults to ''.
    _FAULT_FIELDS = ('HMC', '故障描述', '维修策略', '系统', '子系统',
                     '成品', '型号', '机型', '文档id', '文档名称')

    def __init__(self, data_path=None):
        """Remember which JSON file to read.

        Args:
            data_path: Path of the JSON file; defaults to the module-level
                ``json_file_path`` written by ``excel_to_json``.
        """
        self.data_path = json_file_path if data_path is None else data_path

    def read_nodes(self):
        """Read the JSON records and collect node values and edge pairs.

        Returns:
            A 19-tuple, in this order: sets of ``(value, doc_id, doc_name)``
            tuples for HMC, fault description, system, sub-system, repair
            strategy, product, model and aircraft type; the set of document
            ids; the set of document names; the list of per-record fault
            dictionaries; then the edge lists ``rels_des``, ``rels_sys``,
            ``sys_sys``, ``rels_excludes``, ``rels_objs``, ``rels_obj_obj``,
            ``sys_objs`` and ``plan_sys`` (each a list of two-item lists).
        """
        hmcs = []        # HMC values
        describs = []    # fault descriptions
        systems = []     # systems
        s_systems = []   # sub-systems
        excludes = []    # repair strategies
        objs = []        # products
        x_objects = []   # models
        plans = []       # aircraft types
        doc_id = []
        doc_name = []
        fault_infos = []

        rels_des = []        # HMC -> fault description
        rels_sys = []        # HMC -> system
        sys_sys = []         # system -> sub-system
        rels_excludes = []   # HMC / fault description -> repair strategy
        rels_objs = []       # HMC -> product
        rels_obj_obj = []    # product -> model
        sys_objs = []        # sub-system -> product
        plan_sys = []        # aircraft type -> system

        with open(self.data_path, "r", encoding='utf-8') as f:
            data = json.load(f)

        for record in data:
            if 'HMC' not in record:
                # Every edge is keyed on the HMC, so such a record is unusable.
                print(f"Warning: record without HMC skipped {record}")
                continue

            # Edge lists carry the HMC as text; node tuples keep the raw value.
            hmc_key = str(record['HMC'])
            doc = (record.get('文档id', ''), record.get('文档名称', ''))

            fault_dict = dict.fromkeys(self._FAULT_FIELDS, '')
            fault_dict.update(
                (field, record[field])
                for field in self._FAULT_FIELDS if field in record
            )

            hmcs.append((record['HMC'], *doc))

            if '故障描述' in record:
                rels_des.append([hmc_key, record['故障描述']])
                describs.append((record['故障描述'], *doc))

            if '机型' in record:
                plans.append((record['机型'], *doc))
                plan_sys.append([record['机型'], record.get('系统')])

            if '系统' in record:
                systems.append((record['系统'], *doc))
                rels_sys.append([hmc_key, record['系统']])

            if '子系统' in record:
                if record['子系统'] is None:
                    # NOTE(review): this drops the rest of the record,
                    # including its Fault node. Kept as in the original;
                    # confirm whether that is intended.
                    continue
                s_systems.append((record['子系统'], *doc))
                sys_sys.append([record.get('系统'), record['子系统']])

            if '维修策略' in record:
                strategy = record['维修策略']
                excludes.append((strategy, *doc))
                rels_excludes.append([hmc_key, strategy])
                rels_excludes.append([record.get('故障描述'), strategy])

            if '成品' in record:
                objs.append((record['成品'], *doc))
                rels_objs.append([hmc_key, record['成品']])
                sys_objs.append([record.get('子系统'), record['成品']])

            if '型号' in record:
                x_objects.append((record['型号'], *doc))
                rels_obj_obj.append([record.get('成品'), record['型号']])

            if '文档id' in record:
                doc_id.append(record['文档id'])

            if '文档名称' in record:
                doc_name.append(record['文档名称'])

            fault_infos.append(fault_dict)

        return (set(hmcs), set(describs), set(systems), set(s_systems),
                set(excludes), set(objs), set(x_objects), set(plans),
                set(doc_id), set(doc_name), fault_infos, rels_des, rels_sys,
                sys_sys, rels_excludes, rels_objs, rels_obj_obj, sys_objs,
                plan_sys)

    def create_node(self, label, nodes):
        """Create one ``Node`` per entry of ``nodes``.

        Args:
            label: Label given to every created node.
            nodes: Iterable of ``(name, doc_id, doc_name)`` triples (tuple or
                list) or plain strings. A plain string gets the placeholder
                ``doc_id``/``doc_name`` values; anything else is reported and
                skipped.

        Returns:
            List of the created nodes.
        """
        created = []
        for entry in nodes:
            if isinstance(entry, (tuple, list)) and len(entry) == 3:
                name, entry_doc_id, entry_doc_name = entry
                created.append(Node(label, name=name, doc_id=entry_doc_id,
                                    doc_name=entry_doc_name))
            elif isinstance(entry, str):
                created.append(Node(label, name=entry, doc_id='default_id',
                                    doc_name='default_name'))
            else:
                print(f"Warning: Invalid node format {entry}")
        return created

    def create_fault_nodes(self, fault_infos):
        """Create the central ``Fault`` node of each fault record.

        Args:
            fault_infos: List of fault dictionaries as built by ``read_nodes``.

        Returns:
            List of ``Fault`` nodes. Properties whose value is ``None`` or an
            empty string are left out.
        """
        # (node property name, fault dictionary key)
        property_keys = (
            ('name', 'HMC'),
            ('fault_des', '故障描述'),
            ('fault_excluds', '维修策略'),
            ('systems', '系统'),
            ('s_sys', '子系统'),
            ('fault_obj', '成品'),
            ('fault_x_obj', '型号'),
            ('plans', '机型'),
        )
        nodes = []
        for fault_dict in fault_infos:
            # The original test used `or`, which is always true and therefore
            # filtered nothing; `and` drops the empty values as intended.
            properties = {
                prop: fault_dict[key]
                for prop, key in property_keys
                if fault_dict[key] is not None and fault_dict[key] != ''
            }
            nodes.append(Node('Fault', **properties))
        return nodes

    def create_graphnodes(self):
        """Create all graph nodes: Fault nodes first, then one list per label.

        Returns:
            A single list with the nodes of every type.
        """
        (hmcs, describs, systems, s_systems, excludes, objs, x_objects, plans,
         doc_id, doc_name, fault_infos, rels_des, rels_sys, sys_sys,
         rels_excludes, rels_objs, rels_obj_obj, sys_objs,
         plan_sys) = self.read_nodes()

        nodes = self.create_fault_nodes(fault_infos)
        for label, values in (
            ('HMC', hmcs),
            ('故障描述', describs),
            ('系统', systems),
            ('子系统', s_systems),
            ('维修策略', excludes),
            ('成品', objs),
            ('型号', x_objects),
            ('机型', plans),
        ):
            nodes += self.create_node(label, values)
        return nodes

    def create_graphrels(self):
        """Create the relationship queries for every edge list.

        Returns:
            A single list of query strings (see ``create_relationship``).
        """
        (hmcs, describs, systems, s_systems, excludes, objs, x_objects, plans,
         doc_id, doc_name, fault_infos, rels_des, rels_sys, sys_sys,
         rels_excludes, rels_objs, rels_obj_obj, sys_objs,
         plan_sys) = self.read_nodes()

        # (start label, end label, edges, relationship type, relationship name)
        specs = (
            ('name', 'describ', rels_des, '故障描述', '故障描述'),
            ('name', 'systems', rels_sys, '系统', '系统'),
            ('name', 'excludes', rels_excludes, '维修策略', '维修策略'),
            ('name', 'object', rels_objs, '成品', '成品'),
            ('object', 'x_object', rels_obj_obj, '型号', '型号'),
            ('name', 'system', sys_sys, '子系统', '子系统'),
            ('name', 'object', sys_objs, '成品', '成品'),
            ('plan', 'system', plan_sys, '系统', '系统'),
        )
        queries = []
        for spec in specs:
            queries += self.create_relationship(*spec)
        return queries

    def create_relationship(self, start_node, end_node, edges, rel_type, rel_name):
        """Build one merge query per distinct edge.

        Args:
            start_node: Label written for the start node pattern.
            end_node: Label written for the end node pattern.
            edges: Iterable of two-item sequences ``[start_name, end_name]``.
                Entries that are not two strings are reported and ignored.
            rel_type: Relationship type written into the query.
            rel_name: Value of the relationship's ``name`` property.

        Returns:
            List of query strings, one per distinct edge, in first-seen order.
        """
        # De-duplicate on tuples so a name may contain any text; the original
        # joined and split on '###', which broke names containing it.
        unique_edges = {}
        for edge in edges:
            if len(edge) == 2 and all(isinstance(item, str) for item in edge):
                unique_edges[tuple(edge)] = None
            else:
                print(f'warning: Invaild edge {edge}. Ignoring.')

        return [
            "match(p:%s),(q:%s) where p.name='%s'and q.name='%s' merge (p)-[rel:%s{name:'%s'}]->(q)" % (
                start_node, end_node, p, q, rel_type, rel_name)
            for p, q in unique_edges
        ]


class ProgressBar:
    """Single-line console progress bar with four display modes."""

    def __init__(self, total, name='', mode=0):
        """Set up the bar.

        Args:
            total: Value of ``n`` that corresponds to 100 %.
            name: Text printed in front of the bar.
            mode: Index of the display style; values above 3 fall back to 0.
        """
        self.total = total
        self.name = name
        modes = (
            lambda n: f"进度|{'=' * n}{'>'}{'·' * (100 - n)}"[:-1] + f"| {n}% |",
            lambda n: f"进度|{'█' * n:100s}| {n}% |",
            lambda n: f"\033[31m{'♥' * n}{'♡' * (100 - n)} 进度{n}♥\033[0m",
            lambda n: f"\033[46m进度{' ' * n}{n}% \033[44m{' ' * (100 - n)}\033[0m",
        )
        mode = 0 if mode > 3 else mode
        self.mode = modes[mode]

    def now(self, n):
        """Redraw the bar for ``n`` finished items; no-op when ``BAR`` is false."""
        if BAR:
            # Guard against an empty workload instead of dividing by zero.
            percent = 100 * n // self.total if self.total else 100
            print(f"\r{self.name}: {self.mode(percent)} [{n:05d} / {self.total}]",
                  end='', flush=True)

    def end(self, name):
        """Print ``name`` as the closing message."""
        print(name)


BAR = True
t0 = time()

# Mirrors the query text produced by FaultGraph.create_relationship. One
# anchored pattern replaces chained str.split calls, which truncated any
# start name that contained a space.
_REL_QUERY_RE = re.compile(
    r"match\(p:.*?\),\(q:.*?\) where p\.name='(?P<start>.*)'and q\.name='(?P<end>.*)'"
    r" merge \(p\)-\[rel:(?P<rel_type>.*?)\{name:'(?P<rel_name>.*?)'\}\]->\(q\)",
    re.DOTALL,
)


def main():
    """Build nodes and relationships from the JSON data and store them in Neo4j."""
    handler = FaultGraph()
    graphnodes = handler.create_graphnodes()
    graphrels = handler.create_graphrels()
    print(f"匹配所用时间:{time() - t0:.1f}秒")
    print('正在创建图谱 . . . . . .', end='')

    # The password can be supplied through NEO4J_PASSWORD; the literal is kept
    # only as a fallback so existing setups keep working.
    password = os.environ.get("NEO4J_PASSWORD", "fdx3081475970")
    graph = Graph("bolt://localhost:7687", auth=("neo4j", password))

    # Index nodes by the text form of their name: edge lists always carry
    # names as strings, while node names keep the raw JSON value (e.g. int).
    # Later nodes win when several share a name.
    nodes_by_name = {str(node['name']): node for node in graphnodes}

    progress = ProgressBar(len(graphrels), '匹配进度', mode=3)
    relationships = []
    for done, query in enumerate(graphrels):
        progress.now(done)
        match = _REL_QUERY_RE.fullmatch(query)
        if match is None:
            print(f"Warning: unparsable relationship query {query}")
            continue
        start = nodes_by_name.get(match['start'])
        end = nodes_by_name.get(match['end'])
        if start is None or end is None:
            print(f"Warning: no node for relationship {match['start']} -> {match['end']}")
            continue
        relationships.append(
            Relationship(start, match['rel_type'], end, name=match['rel_name']))

    print(f"匹配所用时间:{time() - t0:.1f}秒")
    print('数据正在导入数据库......')
    graph.create(Subgraph(nodes=graphnodes, relationships=relationships))
    print('\r知识图谱数据库创建完成 !!')
    print(f"总体所用时间:{time() - t0:.1f}秒")


if __name__ == '__main__':
    main()