123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304 |
- from time import time
- from py2neo import Graph, Node, Relationship, Subgraph
- import json
- import pandas as pd
- import re
def excel_to_json(excel_file, json_file):
    """Convert an Excel workbook into a JSON file of row records.

    The workbook is loaded with ``pandas.read_excel`` and serialised with
    ``orient='records'`` (one JSON object per row). ``force_ascii=False``
    keeps non-ASCII text readable instead of escaping it.

    Args:
        excel_file: Path of the Excel file to read.
        json_file: Path of the JSON file to write (UTF-8, overwritten).
    """
    records_json = pd.read_excel(excel_file).to_json(orient='records', force_ascii=False)
    with open(json_file, 'w', encoding='utf-8') as out:
        out.write(records_json)
    print(f"转换完成,JSON文件已保存至:{json_file}")
# Example usage: convert the source workbook once, at import time, so the
# JSON export exists before the graph is built.
excel_file_path = r'C:\Users\Machenike\Desktop\xn_data.xlsx'  # Excel workbook to read
json_file_path = r'D:\hiddz\KG_QA\T_Neo\data.json'  # JSON file to write
excel_to_json(excel_file=excel_file_path, json_file=json_file_path)
class FaultGraph:
    """Turn exported fault records into py2neo nodes and relationship queries.

    The records are read from a JSON file holding a list of objects (one per
    fault). Each record is expected to carry an ``HMC`` key and may carry the
    columns ``故障描述``, ``机型``, ``系统``, ``子系统``, ``维修策略``, ``成品``,
    ``型号``, ``文档id`` and ``文档名称``.
    """

    def __init__(self, data_path='data.json'):
        """Remember which JSON file to read.

        Args:
            data_path: Path of the JSON record file. Defaults to
                ``'data.json'`` in the working directory, which is the file
                this class has always read.
        """
        self.data_path = data_path

    def read_nodes(self):
        """Parse the JSON file into entity sets and relationship edge lists.

        Returns:
            A 19-tuple, in this order:
            ``set`` of ``(value, doc_id, doc_name)`` tuples for HMC, 故障描述,
            系统, 子系统, 维修策略, 成品, 型号 and 机型; ``set`` of document ids;
            ``set`` of document names; the list of per-record fault dicts; and
            the edge lists (each edge a two-item list) HMC→故障描述, HMC→系统,
            系统→子系统, HMC/故障描述→维修策略, HMC→成品, 成品→型号, 子系统→成品
            and 机型→系统.

        Raises:
            KeyError: If a record has no ``HMC`` key.
        """
        hmcs = []
        describs = []
        systems = []
        s_systems = []
        excludes = []
        objs = []
        x_objects = []
        plans = []
        doc_id = []
        doc_name = []
        fault_infos = []
        rels_des = []
        rels_sys = []
        sys_sys = []
        rels_excludes = []
        rels_objs = []
        rels_obj_obj = []
        sys_objs = []
        plan_sys = []

        with open(self.data_path, 'r', encoding='utf-8') as f:
            records = json.load(f)

        for record in records:
            hmc_raw = record['HMC']
            # Edges use the string form of the code; nodes keep the raw value.
            hmc = str(hmc_raw)
            # Missing document columns fall back to '' instead of raising.
            doc_id_value = record.get('文档id', '')
            doc_name_value = record.get('文档名称', '')

            fault_infos.append({
                'HMC': hmc_raw,
                '故障描述': record.get('故障描述', ''),
                '维修策略': record.get('维修策略', ''),
                '系统': record.get('系统', ''),
                '子系统': record.get('子系统', ''),
                '成品': record.get('成品', ''),
                '型号': record.get('型号', ''),
                '机型': record.get('机型', ''),
                '文档id': doc_id_value,
                '文档名称': doc_name_value,
            })

            hmcs.append((hmc_raw, doc_id_value, doc_name_value))

            if '故障描述' in record:
                rels_des.append([hmc, record['故障描述']])
                describs.append((record['故障描述'], doc_id_value, doc_name_value))

            if '机型' in record:
                plans.append((record['机型'], doc_id_value, doc_name_value))
                # .get(): an absent column yields None, and edges with a
                # non-string end are discarded by create_relationship.
                plan_sys.append([record['机型'], record.get('系统')])

            if '系统' in record:
                systems.append((record['系统'], doc_id_value, doc_name_value))
                rels_sys.append([hmc, record['系统']])

            # A null subsystem only suppresses the subsystem node and its
            # edge; the remaining columns of the record are still processed.
            if record.get('子系统') is not None:
                s_systems.append((record['子系统'], doc_id_value, doc_name_value))
                sys_sys.append([record.get('系统'), record['子系统']])

            if '维修策略' in record:
                excludes.append((record['维修策略'], doc_id_value, doc_name_value))
                rels_excludes.append([hmc, record['维修策略']])
                rels_excludes.append([record.get('故障描述'), record['维修策略']])

            if '成品' in record:
                objs.append((record['成品'], doc_id_value, doc_name_value))
                rels_objs.append([hmc, record['成品']])
                sys_objs.append([record.get('子系统'), record['成品']])

            if '型号' in record:
                x_objects.append((record['型号'], doc_id_value, doc_name_value))
                rels_obj_obj.append([record.get('成品'), record['型号']])

            if '文档id' in record:
                doc_id.append(record['文档id'])
            if '文档名称' in record:
                doc_name.append(record['文档名称'])

        return (
            set(hmcs), set(describs), set(systems), set(s_systems),
            set(excludes), set(objs), set(x_objects), set(plans),
            set(doc_id), set(doc_name), fault_infos,
            rels_des, rels_sys, sys_sys, rels_excludes, rels_objs,
            rels_obj_obj, sys_objs, plan_sys,
        )

    def create_node(self, label, nodes):
        """Create one py2neo ``Node`` per entity.

        Args:
            label: Node label to apply.
            nodes: Iterable of ``(name, doc_id, doc_name)`` triples, or of
                plain strings (which get placeholder document fields).
                Anything else is reported and skipped.

        Returns:
            List of the created ``Node`` objects.
        """
        created = []
        for item in nodes:
            if isinstance(item, (tuple, list)) and len(item) == 3:
                created.append(
                    Node(label, name=item[0], doc_id=item[1], doc_name=item[2]))
            elif isinstance(item, str):
                created.append(
                    Node(label, name=item, doc_id='default_id', doc_name='default_name'))
            else:
                print(f"Warning: Invalid node format {item}")
        return created

    @staticmethod
    def _fault_properties(fault_dict):
        """Map a fault record to ``Fault`` node properties, dropping blanks."""
        properties = {
            'name': fault_dict['HMC'],
            'fault_des': fault_dict['故障描述'],
            'fault_excluds': fault_dict['维修策略'],
            'systems': fault_dict['系统'],
            's_sys': fault_dict['子系统'],
            'fault_obj': fault_dict['成品'],
            'fault_x_obj': fault_dict['型号'],
            'plans': fault_dict['机型'],
        }
        # "and", not "or": a value must be neither None nor '' to be kept.
        return {
            key: value for key, value in properties.items()
            if value is not None and value != ''
        }

    def create_fault_nodes(self, fault_infos):
        """Create the central ``Fault`` node for every fault record.

        Args:
            fault_infos: List of fault dicts as returned by ``read_nodes``.

        Returns:
            List of ``Node`` objects labelled ``Fault``; properties whose
            value is ``None`` or ``''`` are omitted.
        """
        return [
            Node('Fault', **self._fault_properties(fault_dict))
            for fault_dict in fault_infos
        ]

    def create_graphnodes(self):
        """Build every node of the graph: faults first, then each entity type.

        Returns:
            One list with the ``Fault`` nodes followed by the HMC, 故障描述,
            系统, 子系统, 维修策略, 成品, 型号 and 机型 nodes.
        """
        (hmcs, describs, systems, s_systems, excludes, objs, x_objects, plans,
         _doc_ids, _doc_names, fault_infos, *_edge_lists) = self.read_nodes()

        nodes = self.create_fault_nodes(fault_infos)
        labelled_entities = (
            ('HMC', hmcs),
            ('故障描述', describs),
            ('系统', systems),
            ('子系统', s_systems),
            ('维修策略', excludes),
            ('成品', objs),
            ('型号', x_objects),
            ('机型', plans),
        )
        for label, entities in labelled_entities:
            nodes.extend(self.create_node(label, entities))
        return nodes

    def create_graphrels(self):
        """Build the relationship query strings for every edge list.

        Returns:
            One list of query strings, grouped in the order: 故障描述, 系统,
            维修策略, 成品, 型号, 子系统, 成品 (from subsystem), 系统 (from 机型).
        """
        # Items 11..18 of read_nodes() are the eight edge lists.
        (rels_des, rels_sys, sys_sys, rels_excludes, rels_objs, rels_obj_obj,
         sys_objs, plan_sys) = self.read_nodes()[11:]

        specs = (
            ('name', 'describ', rels_des, '故障描述'),
            ('name', 'systems', rels_sys, '系统'),
            ('name', 'excludes', rels_excludes, '维修策略'),
            ('name', 'object', rels_objs, '成品'),
            ('object', 'x_object', rels_obj_obj, '型号'),
            ('name', 'system', sys_sys, '子系统'),
            ('name', 'object', sys_objs, '成品'),
            ('plan', 'system', plan_sys, '系统'),
        )
        queries = []
        for start_label, end_label, edges, rel in specs:
            queries.extend(
                self.create_relationship(start_label, end_label, edges, rel, rel))
        return queries

    def create_relationship(self, start_node, end_node, edges, rel_type, rel_name):
        """Render de-duplicated edges as Cypher ``match ... merge`` strings.

        Args:
            start_node: Label text placed on the ``p`` pattern.
            end_node: Label text placed on the ``q`` pattern.
            edges: Iterable of two-item sequences ``[start_name, end_name]``.
                Edges that are not exactly two strings are reported and
                ignored.
            rel_type: Relationship type written into the query.
            rel_name: Value of the relationship's ``name`` property.

        Returns:
            List of query strings, one per distinct edge, in first-seen order.
        """
        # A dict keyed by the (start, end) tuple de-duplicates while keeping
        # insertion order, and is safe for names that contain any marker text.
        unique_edges = {}
        for edge in edges:
            if len(edge) == 2 and all(isinstance(item, str) for item in edge):
                unique_edges[tuple(edge)] = None
            else:
                print(f'warning: Invaild edge {edge}. Ignoring.')

        # NOTE(review): names are interpolated unescaped; a name containing a
        # single quote yields a malformed query string.
        return [
            "match(p:%s),(q:%s) where p.name='%s'and q.name='%s' merge (p)-[rel:%s{name:'%s'}]->(q)" % (
                start_node, end_node, p, q, rel_type, rel_name)
            for p, q in unique_edges
        ]
class ProgressBar:
    """Single-line console progress bar with four selectable render styles."""

    def __init__(self, total, name='', mode=0):
        """Configure the bar.

        Args:
            total: Number of work items that corresponds to 100%.
            name: Caption printed in front of the bar.
            mode: Index (0-3) of the render style; any value outside that
                range selects style 0.
        """
        self.total = total
        self.name = name
        # Each renderer takes an integer percentage and returns the bar text.
        modes = (
            lambda n: f"进度|{'=' * n}{'>'}{'·' * (100 - n)}"[:-1] + f"| {n}% |",
            lambda n: f"进度|{'█' * n:100s}| {n}% |",
            lambda n: f"\033[31m{'♥' * n}{'♡' * (100 - n)} 进度{n}♥\033[0m",
            lambda n: f"\033[46m进度{' ' * n}{n}% \033[44m{' ' * (100 - n)}\033[0m",
        )
        # Reject negative indices too, which would otherwise silently pick a
        # style counted from the end of the tuple.
        if not 0 <= mode < len(modes):
            mode = 0
        self.mode = modes[mode]

    def now(self, n):
        """Redraw the bar for ``n`` completed items.

        Output is suppressed when the module-level flag ``BAR`` is falsy; the
        bar is treated as enabled if that flag is not defined.

        Args:
            n: Number of items completed so far.
        """
        if not globals().get('BAR', True):
            return
        # An empty job (total == 0) is shown as 0% instead of dividing by zero.
        percent = 100 * n // self.total if self.total else 0
        print(f"\r{self.name}: {self.mode(percent)} [{n:05d} / {self.total}]", end='', flush=True)

    def end(self, name):
        """Print ``name`` on its own line."""
        print(name)
BAR = True  # progress-bar switch read by ProgressBar.now(); False silences it
t0 = time()  # start timestamp for the elapsed-time reports below

if __name__ == '__main__':
    # Build the nodes and the relationship query strings from the JSON export.
    handler = FaultGraph()
    graphnodes = handler.create_graphnodes()
    graphrels = handler.create_graphrels()
    print(f"匹配所用时间:{time() - t0:.1f}秒")
    print('正在创建图谱 . . . . . .', end='')
    # NOTE(review): the connection URI and credentials are hard-coded; move
    # them to configuration or the environment before sharing this script.
    graph = Graph("bolt://localhost:7687", auth=("neo4j", "fdx3081475970"))

    # Index nodes by their name property so relationship ends can be resolved.
    # Nodes of different labels that share a name overwrite each other here.
    dicts = {}
    for i in graphnodes:
        dicts[i['name']] = i

    p = ProgressBar(len(graphrels), '匹配进度', mode=3)
    s = 0
    relationships = []
    for s, i in enumerate(graphrels, start=1):
        # The query embeds, in order, three single-quoted values: start name,
        # end name and relationship name. Splitting on the quote keeps names
        # that contain spaces intact.
        quoted = i.split("'")
        n = quoted[1]
        m = quoted[3]
        # Numeric names are keyed by int when the JSON stored them as numbers;
        # fall back to the string key when no such int-keyed node exists.
        if re.match(r"^\d+$", n) and int(n) in dicts:
            n = int(n)
        rr = i.split('[')[-1].split(']')[0][4:].split("'")[1]
        r = i.split('[')[-1].split(':')[1].split('{')[0]
        relationships.append(Relationship(dicts[n], r, dicts[m], name=rr))
        # Report after the work so the bar ends on the full count.
        p.now(s)
    print(f"匹配所用时间:{time() - t0:.1f}秒")
    print('数据正在导入数据库......')
    graph.create(Subgraph(nodes=graphnodes, relationships=relationships))
    print('\r知识图谱数据库创建完成 !!')
    print(f"总体所用时间:{time() - t0:.1f}秒")
|