Signed-off-by: changyunju <2743319061@qq.com>

changyunju 2025-06-14 17:44:24 +08:00
parent 1e48d34c26
commit f326b9d818
9 changed files with 97070 additions and 0 deletions

8
.idea/.gitignore vendored Normal file

@@ -0,0 +1,8 @@
# Default ignored files
/shelf/
/workspace.xml
# Editor-based HTTP Client requests
/httpRequests/
# Datasource local storage ignored files
/dataSources/
/dataSources.local.xml

7
.idea/misc.xml Normal file

@@ -0,0 +1,7 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="Black">
    <option name="sdkName" value="Python 3.13 (PyCharmMiscProject)" />
  </component>
  <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.13 (PyCharmMiscProject)" project-jdk-type="Python SDK" />
</project>

8
.idea/modules.xml Normal file

@@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="ProjectModuleManager">
    <modules>
      <module fileurl="file://$PROJECT_DIR$/PyCharmMiscProject.iml" filepath="$PROJECT_DIR$/PyCharmMiscProject.iml" />
    </modules>
  </component>
</project>

10
PyCharmMiscProject.iml Normal file

@@ -0,0 +1,10 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
  <component name="NewModuleRootManager">
    <content url="file://$MODULE_DIR$">
      <excludeFolder url="file://$MODULE_DIR$/.venv" />
    </content>
    <orderEntry type="jdk" jdkName="Python 3.13 (PyCharmMiscProject)" jdkType="Python SDK" />
    <orderEntry type="sourceFolder" forTests="false" />
  </component>
</module>

96918
edge.csv Normal file

File diff suppressed because it is too large.

BIN
edge.txt Normal file

Binary file not shown.

BIN
edge.xlsx Normal file

Binary file not shown.

1
knowledge_graph.json Normal file

File diff suppressed because one or more lines are too long

118
script.py Normal file

@@ -0,0 +1,118 @@
import json
import pandas as pd
import sys
# Try reading the CSV file with different encodings
encodings = ['gbk', 'gb18030', 'utf-8', 'latin1']
df = None
file_path = 'edge.csv'
try:
    for encoding in encodings:
        try:
            df = pd.read_csv(file_path, encoding=encoding, header=None)
            print(f"Successfully read the file with {encoding} encoding.")
            break
        except UnicodeDecodeError:
            continue
    if df is None:
        raise ValueError("Could not read the file with any of the supported encodings.")
except Exception as e:
    if isinstance(e, FileNotFoundError):
        print(f"Error: make sure the file '{file_path}' exists in the current directory.")
    sys.exit(f"Error while reading the file: {str(e)}")

# Check the column count
if len(df.columns) < 13:
    sys.exit(f"Error: the CSV file needs at least 13 columns; it currently has {len(df.columns)}.")

# --- Data preprocessing and sampling ---
# 1. Skip the first row (the header) and reset the index
df = df.iloc[1:].reset_index(drop=True)
print(f"The raw data contains {len(df)} rows.")

# 2. Extract the date part from the timestamps in column M (index 12) for grouping.
#    'coerce' turns unparseable values into NaT (Not a Time), which makes them easy to drop later.
df['date_only'] = pd.to_datetime(df[12], errors='coerce').dt.date

# 3. Drop rows whose date conversion failed
df.dropna(subset=['date_only'], inplace=True)

# 4. Group by date and take a random sample from each group:
#    - for each date group (x), if the group has at least 10 rows, randomly sample 10;
#    - if it has fewer than 10, keep them all.
sampled_df = df.groupby('date_only').apply(lambda x: x.sample(n=min(len(x), 10))).reset_index(drop=True)
print(f"After per-day random sampling, {len(sampled_df)} rows are kept for processing.")

# --- JSON graph generation ---
graph = {
    "nodes": [],
    "links": []
}
nodes_added = set()
media_accounts = [
    '人民网', '央视新闻', '新华社', '环球时报', '军武吐槽',
    '观察者网', '环球网', '中国日报', '南方周末', '新京报',
    '澎湃新闻', '界面新闻', '财新网', '经济日报', '光明日报'
]

# Iterate over the sampled rows
for idx, row in sampled_df.iterrows():
    try:
        # Extract fields
        source_name = str(row[0])            # Column A: source name (also used as its ID)
        target_name = str(row[1])            # Column B: target's display name
        target_id = str(int(float(row[3])))  # Column D: target's unique ID
        timestamp = str(row[12])             # Column M: timestamp
        link_type = "user cite user"

        # Add the source node (if it does not exist yet)
        source_id = source_name
        if source_id not in nodes_added:
            graph["nodes"].append({
                "id": source_id,
                "type": "media" if source_name in media_accounts else "user",
                "properties": {"name": source_name},
                "personProperties": {"url": "src/views/user/default.png"}
            })
            nodes_added.add(source_id)

        # Add the target node (if it does not exist yet)
        if target_id not in nodes_added:
            node_properties = {"name": target_name}
            person_properties = {"url": "src/views/user/default.png"}
            if target_id == '0':
                node_properties["special_type"] = ["initial"]
                person_properties["url"] = "src/views/user/boss.png"
            graph["nodes"].append({
                "id": target_id,
                "type": "user",
                "properties": node_properties,
                "personProperties": person_properties
            })
            nodes_added.add(target_id)

        # Create the link; target must reference the node's ID
        # (was target_name, which never matches any node id added above)
        graph["links"].append({
            "source": source_id,
            "target": target_id,
            "type": link_type,
            "properties": None,
            "ranks": [timestamp]
        })
    except Exception as e:
        print(f"Error while processing row {idx + 1}: {str(e)}")

# --- Save the result ---
output_file = "knowledge_graph.json"
with open(output_file, "w", encoding="utf-8") as f:
    # Write compact JSON with no line breaks
    json.dump(graph, f, ensure_ascii=False)
print(f"\nConversion complete; the result was saved as single-line JSON to: {output_file}")
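For reference, a minimal sketch of how the generated file could be sanity-checked after running script.py. It assumes only the structure written above (a "nodes" list keyed by "id" and a "links" list referencing those ids); the helper name check_graph.py is hypothetical and is not part of this commit.

# check_graph.py -- hypothetical helper, not part of the commit: loads knowledge_graph.json
# and verifies that every link endpoint refers to an existing node id.
import json

with open("knowledge_graph.json", encoding="utf-8") as f:
    graph = json.load(f)

node_ids = {node["id"] for node in graph["nodes"]}
dangling = [link for link in graph["links"]
            if link["source"] not in node_ids or link["target"] not in node_ids]

print(f"nodes: {len(graph['nodes'])}, links: {len(graph['links'])}, dangling links: {len(dangling)}")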