Files
AIEC_Skills/.claude/skills/transcript-cleaner-v1/scripts/merge_transcripts.py
2025-12-11 14:19:36 +08:00

188 lines
5.3 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
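# -*- coding: utf-8 -*-
r"""
Transcript merge script.

Merges the transcript .txt files found in a given folder into a single file,
in file-name order. A merge marker is added at every file boundary to warn
that speaker labels before and after it may not correspond.

Usage:
    python merge_transcripts.py <input folder path> <output file path>

Example:
    python merge_transcripts.py "C:\转写文件" "output\merged_transcript.md"
"""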
import os
import re
import sys
from datetime import datetime
from pathlib import Path
from typing import Optional
def extract_date_from_filename(filename: str) -> Optional[str]:
    """Extract a date from a file name.

    Scans *filename* for runs of eight digits (e.g. ``20251202``) and returns
    the first run that is a real calendar date.

    Args:
        filename: File name to inspect.

    Returns:
        The date formatted as ``YYYY-MM-DD``, or ``None`` when the name holds
        no eight-digit run that forms a valid date.
    """
    for match in re.finditer(r'\d{8}', filename):
        date_str = match.group(0)
        try:
            # Reject digit runs such as "99999999" that are not real dates.
            datetime.strptime(date_str, "%Y%m%d")
        except ValueError:
            continue
        return f"{date_str[:4]}-{date_str[4:6]}-{date_str[6:8]}"
    return None
# Regex patterns for screen-content lines that carry no useful information.
USELESS_SCREEN_PATTERNS = [
    # Black screen / screen turning black
    r"^画面内容:.*(?:画面变黑|黑屏|屏幕变黑|黑色背景).*$",
    # Bare mouse movement with nothing else described
    r"^画面内容:鼠标(?:光标)?(?:在.*)?移动[。.]?$",
    # Window still loading
    r"^画面内容:.*正在加载.*$",
    # Meaningless visuals such as white light spots
    r"^画面内容:.*(?:白色光点|光点).*$",
    # Display of a non-attendee account (the original note says this name is
    # the meeting-room account, not a participant)
    r"^画面内容:.*(?:张媛媛).*$",
]
def is_useless_screen_content(line: str) -> bool:
    """Return True when *line* matches any entry of ``USELESS_SCREEN_PATTERNS``.

    Surrounding whitespace is stripped from the line before matching.
    """
    candidate = line.strip()
    return any(re.match(regex, candidate) for regex in USELESS_SCREEN_PATTERNS)
def merge_transcripts(input_folder: str, output_path: str) -> dict:
    """Merge every ``*.txt`` transcript in a folder into a single file.

    Files are processed in ascending file-name order. Blank lines and lines
    flagged by ``is_useless_screen_content`` are dropped. The output starts
    with a header (meeting date, attendee placeholder, first source file);
    before the content of every file after the first, an HTML-comment merge
    marker naming that file is inserted.

    The meeting date is taken from the first file name for which
    ``extract_date_from_filename`` returns a value, otherwise today's date.

    Args:
        input_folder: Folder containing the transcript ``.txt`` files.
        output_path: Path of the merged file to write. Missing parent
            directories are created.

    Returns:
        dict: On success, the keys ``success`` (True), ``output_path``
        (absolute path), ``file_count``, ``files`` (names in merge order),
        ``meeting_date`` and ``lines_removed``. When no input file is found,
        the keys ``success`` (False), ``error`` and ``file_count`` (0).
    """
    input_path = Path(input_folder)
    output_file = Path(output_path)
    output_resolved = output_file.resolve()

    # Collect regular .txt files sorted by name. The output file itself is
    # skipped so that re-running with an output located inside the input
    # folder does not merge a previous result back into itself.
    txt_files = sorted(
        (
            candidate
            for candidate in input_path.glob("*.txt")
            if candidate.is_file() and candidate.resolve() != output_resolved
        ),
        key=lambda p: p.name,
    )
    if not txt_files:
        return {
            "success": False,
            "error": f"未找到txt文件: {input_folder}",
            "file_count": 0
        }

    # Create the output directory only once there is something to write.
    output_file.parent.mkdir(parents=True, exist_ok=True)

    # Use the first date found in any file name; fall back to today.
    meeting_date = None
    for txt_file in txt_files:
        meeting_date = extract_date_from_filename(txt_file.name)
        if meeting_date:
            break
    if not meeting_date:
        meeting_date = datetime.now().strftime("%Y-%m-%d")

    # Marker inserted at each file boundary.
    merge_marker = (
        "\n"
        "<!-- ===== 文件合并边界 ===== -->\n"
        "<!-- 注意:以下内容来自新文件,发言者编号可能与上文不对应 -->\n"
        "<!-- 来源文件:{filename} -->\n"
    )

    # File header; both label lines use the same "label:value" form.
    header = (
        f"**会议日期**:{meeting_date}\n"
        "**参会人员**:待识别\n"
        "---\n"
        f"<!-- 来源文件:{txt_files[0].name} -->\n"
    )

    merged_content = [header]
    total_lines_removed = 0

    for index, txt_file in enumerate(txt_files):
        # utf-8-sig drops a leading byte-order mark, which would otherwise
        # stick to the first line (defeating the "^..." patterns) and leak
        # into the merged output; plain UTF-8 files are read unchanged.
        with open(txt_file, 'r', encoding='utf-8-sig') as f:
            raw_lines = f.read().strip().split('\n')

        kept_lines = []
        for line in raw_lines:
            if not line.strip():
                continue  # blank line
            if is_useless_screen_content(line):
                total_lines_removed += 1
                continue
            kept_lines.append(line)

        if index > 0:
            # Not the first file: announce the boundary and the new source.
            merged_content.append(merge_marker.format(filename=txt_file.name))
        merged_content.append('\n'.join(kept_lines))

    with open(output_file, 'w', encoding='utf-8') as f:
        f.write('\n'.join(merged_content))

    return {
        "success": True,
        "output_path": str(output_file.absolute()),
        "file_count": len(txt_files),
        "files": [p.name for p in txt_files],
        "meeting_date": meeting_date,
        "lines_removed": total_lines_removed
    }
def main():
    """Command-line entry point: merge a folder of transcripts into one file.

    Expects exactly two arguments in ``sys.argv``: the input folder and the
    output file path. Prints a summary on success. Exits with status 1 on a
    usage error, when the input path is not a directory, or when the merge
    fails.
    """
    if len(sys.argv) != 3:
        print("用法: python merge_transcripts.py <输入文件夹路径> <输出文件路径>")
        print("示例: python merge_transcripts.py \"C:\\转写文件\" \"output\\merged.md\"")
        sys.exit(1)

    input_folder = sys.argv[1]
    output_path = sys.argv[2]
    if not os.path.isdir(input_folder):
        print(f"错误: 输入路径不是文件夹: {input_folder}")
        sys.exit(1)

    try:
        result = merge_transcripts(input_folder, output_path)
    except (OSError, UnicodeError) as exc:
        # An unreadable or undecodable input, or an unwritable output, is
        # reported like any other failed merge instead of as a traceback.
        print(f"合并失败: {exc}")
        sys.exit(1)

    if not result["success"]:
        print(f"合并失败: {result['error']}")
        sys.exit(1)

    print("合并成功!")
    print(f" - 处理文件数: {result['file_count']}")
    print(f" - 会议日期: {result['meeting_date']}")
    print(f" - 过滤无用画面: {result['lines_removed']}")
    print(f" - 输出路径: {result['output_path']}")
    print(" - 文件列表:")
    for fname in result['files']:
        print(f" {fname}")
# Run the command-line entry point only when executed as a script.
if __name__ == "__main__":
    main()