Signed-off-by: changyunju <2743319061@qq.com>
This commit is contained in:
parent
1e48d34c26
commit
f326b9d818
8
.idea/.gitignore
vendored
Normal file
8
.idea/.gitignore
vendored
Normal file
|
|
@ -0,0 +1,8 @@
|
|||
# 默认忽略的文件
|
||||
/shelf/
|
||||
/workspace.xml
|
||||
# 基于编辑器的 HTTP 客户端请求
|
||||
/httpRequests/
|
||||
# Datasource local storage ignored files
|
||||
/dataSources/
|
||||
/dataSources.local.xml
|
||||
7
.idea/misc.xml
Normal file
7
.idea/misc.xml
Normal file
|
|
@ -0,0 +1,7 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project version="4">
|
||||
<component name="Black">
|
||||
<option name="sdkName" value="Python 3.13 (PyCharmMiscProject)" />
|
||||
</component>
|
||||
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.13 (PyCharmMiscProject)" project-jdk-type="Python SDK" />
|
||||
</project>
|
||||
8
.idea/modules.xml
Normal file
8
.idea/modules.xml
Normal file
|
|
@ -0,0 +1,8 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project version="4">
|
||||
<component name="ProjectModuleManager">
|
||||
<modules>
|
||||
<module fileurl="file://$PROJECT_DIR$/PyCharmMiscProject.iml" filepath="$PROJECT_DIR$/PyCharmMiscProject.iml" />
|
||||
</modules>
|
||||
</component>
|
||||
</project>
|
||||
10
PyCharmMiscProject.iml
Normal file
10
PyCharmMiscProject.iml
Normal file
|
|
@ -0,0 +1,10 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<module type="PYTHON_MODULE" version="4">
|
||||
<component name="NewModuleRootManager">
|
||||
<content url="file://$MODULE_DIR$">
|
||||
<excludeFolder url="file://$MODULE_DIR$/.venv" />
|
||||
</content>
|
||||
<orderEntry type="jdk" jdkName="Python 3.13 (PyCharmMiscProject)" jdkType="Python SDK" />
|
||||
<orderEntry type="sourceFolder" forTests="false" />
|
||||
</component>
|
||||
</module>
|
||||
1
knowledge_graph.json
Normal file
1
knowledge_graph.json
Normal file
File diff suppressed because one or more lines are too long
118
script.py
Normal file
118
script.py
Normal file
|
|
@ -0,0 +1,118 @@
|
|||
import json
|
||||
import pandas as pd
|
||||
import sys
|
||||
|
||||
# Read the CSV file, trying several encodings in turn.
#
# Order matters: strict encodings must come first. UTF-8 rejects practically
# every GBK-encoded Chinese text, whereas GBK accepts many UTF-8 byte
# sequences and silently decodes them into mojibake. Trying GBK first could
# therefore "succeed" with corrupted names. latin1 maps every possible byte
# and never fails, so it stays last as the catch-all.
encodings = ['utf-8', 'gbk', 'gb18030', 'latin1']
df = None
file_path = 'edge.csv'

try:
    for encoding in encodings:
        try:
            # header=None: every row, including the title row, is read as data.
            df = pd.read_csv(file_path, encoding=encoding, header=None)
        except UnicodeDecodeError:
            # This encoding cannot decode the file; try the next candidate.
            continue
        else:
            print(f"使用 {encoding} 编码成功读取文件。")
            break
    if df is None:
        raise ValueError("无法使用任何支持的编码读取文件。")
except FileNotFoundError as e:
    # Give a targeted hint before exiting when the input file is missing.
    print(f"错误: 请确保文件 '{file_path}' 存在于当前目录中。")
    sys.exit(f"读取文件时出错: {str(e)}")
except Exception as e:
    # Top-level boundary of the script: report any other read failure and stop.
    sys.exit(f"读取文件时出错: {str(e)}")

# Check the column count: later steps read columns up to index 12 (column M).
if len(df.columns) < 13:
    sys.exit(f"错误: CSV文件需要至少13列,当前只有 {len(df.columns)} 列。")
|
||||
|
||||
# --- Data preprocessing and sampling ---


def _sample_one_day(day_rows):
    """Return up to 10 randomly chosen rows from a single day's group.

    Groups holding 10 rows or fewer are returned in full (in random order).
    """
    keep = min(len(day_rows), 10)
    return day_rows.sample(n=keep)


# Step 1: skip the first row (the header) and renumber the rows from zero.
df = df.iloc[1:]
df = df.reset_index(drop=True)
print(f"原始数据共 {len(df)} 条。")

# Step 2: take the date part of the timestamp in column M (index 12) to use
# as the grouping key. errors='coerce' turns unparseable values into NaT
# (Not a Time) so they can be removed in the next step.
parsed_timestamps = pd.to_datetime(df[12], errors='coerce')
df['date_only'] = parsed_timestamps.dt.date

# Step 3: discard the rows whose date conversion failed.
df = df.dropna(subset=['date_only'])

# Step 4: group by date and draw the random sample from each group.
daily_groups = df.groupby('date_only')
sampled_df = daily_groups.apply(_sample_one_day).reset_index(drop=True)

print(f"按天随机抽样后,保留 {len(sampled_df)} 条数据进行处理。")
|
||||
|
||||
# --- JSON graph generation ---

graph = {
    "nodes": [],
    "links": []
}
# IDs of the nodes already emitted, so each node is added only once.
nodes_added = set()
# Source names that are labelled as "media" nodes instead of "user" nodes.
media_accounts = [
    '人民网', '央视新闻', '新华社', '环球时报', '军武吐槽',
    '观察者网', '环球网', '中国日报', '南方周末', '新京报',
    '澎湃新闻', '界面新闻', '财新网', '经济日报', '光明日报'
]


def _parse_target_id(raw_value):
    """Return the target's unique ID as a canonical integer string.

    Plain integer text is converted directly, so IDs larger than 2**53 are
    not rounded by a trip through float. Values written as "123.0" or "1e5"
    fall back to the float conversion.

    Raises:
        ValueError: if the value is not numeric (including NaN).
        OverflowError: if the value is infinite.
    """
    text = str(raw_value).strip()
    try:
        return str(int(text))
    except ValueError:
        return str(int(float(text)))


# Walk through the sampled rows.
for idx, row in sampled_df.iterrows():
    # Only the field extraction can fail, so only that part is guarded. A bad
    # row is reported and skipped before anything is added to the graph.
    try:
        source_name = str(row[0])              # Column A: source name (also used as its ID)
        target_name = str(row[1])              # Column B: target's display name
        target_id = _parse_target_id(row[3])   # Column D: target's unique ID
        timestamp = str(row[12])               # Column M: timestamp
    except (ValueError, TypeError, OverflowError, KeyError) as e:
        print(f"处理第 {idx + 1} 行时出错: {str(e)}")
        continue

    link_type = "user cite user"

    # Add the source node if it has not been seen yet.
    source_id = source_name
    if source_id not in nodes_added:
        graph["nodes"].append({
            "id": source_id,
            "type": "media" if source_name in media_accounts else "user",
            "properties": {"name": source_name},
            "personProperties": {"url": "src/views/user/default.png"}
        })
        nodes_added.add(source_id)

    # Add the target node if it has not been seen yet.
    if target_id not in nodes_added:
        node_properties = {"name": target_name}
        person_properties = {"url": "src/views/user/default.png"}

        # The target with ID '0' is tagged as the initial node and gets its
        # own avatar.
        if target_id == '0':
            node_properties["special_type"] = ["initial"]
            person_properties["url"] = "src/views/user/boss.png"

        graph["nodes"].append({
            "id": target_id,
            "type": "user",
            "properties": node_properties,
            "personProperties": person_properties
        })
        nodes_added.add(target_id)

    # Create the link. The target must be the target node's ID (not its
    # display name); otherwise the link points at a node that does not exist.
    graph["links"].append({
        "source": source_id,
        "target": target_id,
        "type": link_type,
        "properties": None,
        "ranks": [timestamp]
    })
|
||||
|
||||
# --- Save the result ---
output_file = "knowledge_graph.json"
with open(output_file, "w", encoding="utf-8") as out_fh:
    # No indent is given, so the graph is written as compact single-line
    # JSON; ensure_ascii=False keeps non-ASCII text unescaped.
    serialized_graph = json.dumps(graph, ensure_ascii=False)
    out_fh.write(serialized_graph)

print(f"\n转换完成!结果已作为单行JSON保存至: {output_file}")
|
||||
Loading…
Reference in New Issue
Block a user