Signed-off-by: changyunju <2743319061@qq.com>

2025-06-14 17:44:24 +08:00 · 2025-06-14 17:44:24 +08:00 · f326b9d818
commit f326b9d818
parent 1e48d34c26
9 changed files with 97070 additions and 0 deletions
--- a/.idea/.gitignore
+++ b/.idea/.gitignore
@ -0,0 +1,8 @@
+# 默认忽略的文件
+/shelf/
+/workspace.xml
+# 基于编辑器的 HTTP 客户端请求
+/httpRequests/
+# Datasource local storage ignored files
+/dataSources/
+/dataSources.local.xml
--- a/.idea/misc.xml
+++ b/.idea/misc.xml
@ -0,0 +1,7 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="Black">
+    <option name="sdkName" value="Python 3.13 (PyCharmMiscProject)" />
+  </component>
+  <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.13 (PyCharmMiscProject)" project-jdk-type="Python SDK" />
+</project>
--- a/.idea/modules.xml
+++ b/.idea/modules.xml
@ -0,0 +1,8 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="ProjectModuleManager">
+    <modules>
+      <module fileurl="file://$PROJECT_DIR$/PyCharmMiscProject.iml" filepath="$PROJECT_DIR$/PyCharmMiscProject.iml" />
+    </modules>
+  </component>
+</project>
--- a/PyCharmMiscProject.iml
+++ b/PyCharmMiscProject.iml
@ -0,0 +1,10 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<module type="PYTHON_MODULE" version="4">
+  <component name="NewModuleRootManager">
+    <content url="file://$MODULE_DIR$">
+      <excludeFolder url="file://$MODULE_DIR$/.venv" />
+    </content>
+    <orderEntry type="jdk" jdkName="Python 3.13 (PyCharmMiscProject)" jdkType="Python SDK" />
+    <orderEntry type="sourceFolder" forTests="false" />
+  </component>
+</module>
--- a/edge.csv
+++ b/edge.csv
--- a/edge.txt
+++ b/edge.txt
--- a/edge.xlsx
+++ b/edge.xlsx
--- a/knowledge_graph.json
+++ b/knowledge_graph.json
--- a/script.py
+++ b/script.py
@ -0,0 +1,118 @@
+import json
+import pandas as pd
+import sys
+
+# Read the CSV with the first encoding that decodes cleanly. Strict UTF-8 is tried first
+# because GBK/GB18030 can decode many UTF-8 byte sequences without error (as mojibake).
+encodings = ['utf-8-sig', 'gbk', 'gb18030', 'latin1']
+df = None
+file_path = 'edge.csv'
+try:
+    for encoding in encodings:
+        try:
+            df = pd.read_csv(file_path, encoding=encoding, header=None)
+        except UnicodeDecodeError:
+            continue  # wrong guess; move on to the next candidate encoding
+        print(f"使用 {encoding} 编码成功读取文件。")
+        break
+    if df is None:
+        raise ValueError("无法使用任何支持的编码读取文件。")
+except Exception as e:
+    if isinstance(e, FileNotFoundError):
+        print(f"错误: 请确保文件 '{file_path}' 存在于当前目录中。")
+    sys.exit(f"读取文件时出错: {str(e)}")
+
+# Column M (index 12) carries the timestamp, so at least 13 columns are required.
+if df.shape[1] < 13:
+    sys.exit(f"错误: CSV文件需要至少13列，当前只有 {len(df.columns)} 列。")
+
+# --- Preprocessing and per-day sampling ---
+
+# Drop row 0 (the header row, which was read as data) and renumber the rest from 0.
+df = df.iloc[1:].reset_index(drop=True)
+print(f"原始数据共 {len(df)} 条。")
+
+# Parse column 12 as datetimes; errors='coerce' turns unparseable cells into NaT.
+parsed_timestamps = pd.to_datetime(df[12], errors='coerce')
+df['date_only'] = parsed_timestamps.dt.date
+# Discard the rows whose timestamp could not be parsed.
+df = df.dropna(subset=['date_only'])
+
+# Group by calendar date and keep a random sample of at most 10 rows per day
+# (a day with fewer than 10 rows is kept whole).
+per_day = df.groupby('date_only')
+sampled_df = per_day.apply(lambda day: day.sample(n=min(len(day), 10)))
+sampled_df = sampled_df.reset_index(drop=True)
+
+print(f"按天随机抽样后，保留 {len(sampled_df)} 条数据进行处理。")
+
+# --- Knowledge-graph JSON generation ---
+
+graph = {
+    "nodes": [],
+    "links": []
+}
+nodes_added = set()  # ids of nodes already emitted, so each node is written once
+media_accounts = [
+    '人民网', '央视新闻', '新华社', '环球时报', '军武吐槽',
+    '观察者网', '环球网', '中国日报', '南方周末', '新京报',
+    '澎湃新闻', '界面新闻', '财新网', '经济日报', '光明日报'
+]
+
+# Each sampled row contributes up to two nodes (source, target) and exactly one link.
+for idx, row in sampled_df.iterrows():
+    try:
+        # Extract the fields used from this row.
+        source_name = str(row[0])  # column A: source name (also used as the source node id)
+        target_name = str(row[1])  # column B: target display name
+        target_id = str(int(float(row[3])))  # column D: target id, normalised to an integer string
+        timestamp = str(row[12])  # column M: timestamp
+        link_type = "user cite user"
+
+        # Source node: added only the first time it is seen; its name is its id.
+        source_id = source_name
+        if source_id not in nodes_added:
+            graph["nodes"].append({
+                "id": source_id,
+                "type": "media" if source_name in media_accounts else "user",
+                "properties": {"name": source_name},
+                "personProperties": {"url": "src/views/user/default.png"}
+            })
+            nodes_added.add(source_id)
+
+        # Target node: added only the first time it is seen, keyed by target_id.
+        if target_id not in nodes_added:
+            node_properties = {"name": target_name}
+            person_properties = {"url": "src/views/user/default.png"}
+
+            if target_id == '0':  # id '0' is tagged "initial" and gets its own icon
+                node_properties["special_type"] = ["initial"]
+                person_properties["url"] = "src/views/user/boss.png"
+
+            graph["nodes"].append({
+                "id": target_id,
+                "type": "user",
+                "properties": node_properties,
+                "personProperties": person_properties
+            })
+            nodes_added.add(target_id)
+
+        # Link endpoints must be node ids: the target node is registered under target_id, not its name.
+        graph["links"].append({
+            "source": source_id,
+            "target": target_id,
+            "type": link_type,
+            "properties": None,
+            "ranks": [timestamp]
+        })
+
+    except Exception as e:  # best effort: report the bad row and keep processing the rest
+        print(f"处理第 {idx + 1} 行时出错: {str(e)}")
+
+# --- Save the result ---
+output_file = "knowledge_graph.json"
+with open(output_file, "w", encoding="utf-8") as out_fh:
+    # Compact single-line JSON; ensure_ascii=False writes non-ASCII text as-is, not as \u escapes.
+    out_fh.write(json.dumps(graph, ensure_ascii=False))
+
+print(f"\n转换完成！结果已作为单行JSON保存至: {output_file}")