#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
|
||
转写文件合并脚本
|
||
|
||
将指定文件夹中的多个转写txt文件按文件名顺序合并为一个文件,
|
||
在文件边界处添加合并标记,提示前后发言人标识可能不对应。
|
||
|
||
用法:
|
||
python merge_transcripts.py <输入文件夹路径> <输出文件路径>
|
||
|
||
示例:
|
||
python merge_transcripts.py "C:\转写文件" "output\merged_transcript.md"
|
||
"""
|
||
|
||
import os
import re
import sys
from datetime import datetime
from pathlib import Path
from typing import Optional
|
||
|
||
|
||
def extract_date_from_filename(filename: str) -> Optional[str]:
    """Extract the first valid YYYYMMDD date embedded in a file name.

    Every non-overlapping run of eight digits is tried in order. A run is
    accepted only if it is a real calendar date, so identifiers such as
    ``12345678`` or ``99999999`` are skipped instead of being reported as
    impossible dates.

    Args:
        filename: File name (or any text) to search, e.g. ``"rec_20251202.txt"``.

    Returns:
        The date formatted as ``YYYY-MM-DD``, or ``None`` when no eight-digit
        run forms a valid date.
    """
    for match in re.finditer(r'\d{8}', filename):
        digits = match.group()
        try:
            # strptime is used purely as a validator (month/day ranges,
            # leap years); its result is discarded.
            datetime.strptime(digits, "%Y%m%d")
        except ValueError:
            continue
        # Slice the original digits rather than calling strftime, so the
        # four-digit year is always reproduced exactly as written.
        return f"{digits[:4]}-{digits[4:6]}-{digits[6:8]}"
    return None
|
||
|
||
|
||
# Regular expressions describing "screen content" lines that carry no useful
# information. Each pattern is anchored to a whole line that starts with the
# "画面内容:" prefix.
USELESS_SCREEN_PATTERNS = [
    # Black screen / screen turning black.
    r"^画面内容:.*(?:画面变黑|黑屏|屏幕变黑|黑色背景).*$",
    # Pure mouse movement with nothing else described.
    r"^画面内容:鼠标(?:光标)?(?:在.*)?移动[。.]?$",
    # A window that is still loading.
    r"^画面内容:.*正在加载.*$",
    # Meaningless visuals such as white light spots.
    r"^画面内容:.*(?:白色光点|光点).*$",
    # On-screen account name of a non-attendee (张媛媛 is the meeting-room
    # account, not a participant).
    r"^画面内容:.*(?:张媛媛).*$",
]
|
||
|
||
|
||
def is_useless_screen_content(line: str) -> bool:
    """Tell whether a transcript line is a screen-content note with no value.

    The line is stripped of surrounding whitespace and then tested with
    ``re.match`` against every entry of ``USELESS_SCREEN_PATTERNS``.

    Args:
        line: A single line of transcript text.

    Returns:
        True if at least one pattern matches the stripped line, else False.
    """
    candidate = line.strip()
    return any(re.match(rule, candidate) for rule in USELESS_SCREEN_PATTERNS)
|
||
|
||
|
||
def _filter_transcript_lines(text: str) -> tuple:
    """Split ``text`` into lines, dropping blank lines and useless screen notes.

    Returns:
        tuple: ``(kept_lines, removed_count)``. ``removed_count`` counts only
        the lines rejected by ``is_useless_screen_content``; blank lines are
        dropped without being counted.
    """
    kept_lines = []
    removed_count = 0
    for line in text.split('\n'):
        if not line.strip():
            continue
        if is_useless_screen_content(line):
            removed_count += 1
            continue
        kept_lines.append(line)
    return kept_lines, removed_count


def merge_transcripts(input_folder: str, output_path: str) -> dict:
    """Merge every transcript ``.txt`` file in a folder into one output file.

    Files are processed in file-name order. The output starts with a header
    (meeting date, attendee placeholder, name of the first source file); each
    subsequent file is preceded by a merge marker warning that speaker
    numbering may not match the preceding text. Blank lines and lines
    recognised by ``is_useless_screen_content`` are dropped.

    The meeting date is taken from the first file name that yields a date via
    ``extract_date_from_filename``; if none does, today's date is used.

    Args:
        input_folder: Path of the folder containing the transcript txt files.
        output_path: Path of the merged file to write (parent directories are
            created as needed).

    Returns:
        dict: On success, ``success`` (True), ``output_path`` (absolute path
        as a string), ``file_count``, ``files`` (merged file names in order),
        ``meeting_date`` and ``lines_removed``. When no txt file is found,
        ``success`` (False), ``error`` and ``file_count`` (0).
    """
    input_path = Path(input_folder)
    output_file = Path(output_path)

    # Collect regular *.txt files only, sorted by name. The output file itself
    # is excluded so that re-running with a .txt output located inside the
    # input folder does not merge the previous result into the new one.
    output_resolved = output_file.resolve()
    txt_files = sorted(
        (
            candidate
            for candidate in input_path.glob("*.txt")
            if candidate.is_file() and candidate.resolve() != output_resolved
        ),
        key=lambda candidate: candidate.name,
    )

    if not txt_files:
        return {
            "success": False,
            "error": f"未找到txt文件: {input_folder}",
            "file_count": 0
        }

    # Use the first date found in any file name; fall back to today.
    meeting_date = None
    for txt_file in txt_files:
        meeting_date = extract_date_from_filename(txt_file.name)
        if meeting_date:
            break
    if not meeting_date:
        meeting_date = datetime.now().strftime("%Y-%m-%d")

    # Marker inserted before every file except the first.
    merge_marker = (
        "\n"
        "\n"
        "<!-- ===== 文件合并边界 ===== -->\n"
        "<!-- 注意:以下内容来自新文件,发言者编号可能与上文不对应 -->\n"
        "<!-- 来源文件:{filename} -->\n"
        "\n"
    )

    # Header placed at the top of the merged document.
    header = (
        f"**会议日期**:{meeting_date}\n"
        "**参会人员**:待识别\n"
        "\n"
        "---\n"
        "\n"
        f"<!-- 来源文件:{txt_files[0].name} -->\n"
        "\n"
    )

    merged_content = [header]
    total_lines_removed = 0

    for i, txt_file in enumerate(txt_files):
        # 'utf-8-sig' reads plain UTF-8 and also strips a leading BOM, which
        # would otherwise leak into the output and defeat the '^'-anchored
        # screen-content patterns on the first line.
        with open(txt_file, 'r', encoding='utf-8-sig') as f:
            content = f.read().strip()

        kept_lines, removed_count = _filter_transcript_lines(content)
        total_lines_removed += removed_count

        if i > 0:
            merged_content.append(merge_marker.format(filename=txt_file.name))

        merged_content.append('\n'.join(kept_lines))

    # Create the output directory only once there is something to write, so a
    # failed run leaves no empty directories behind.
    output_file.parent.mkdir(parents=True, exist_ok=True)
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write('\n'.join(merged_content))

    return {
        "success": True,
        "output_path": str(output_file.absolute()),
        "file_count": len(txt_files),
        "files": [f.name for f in txt_files],
        "meeting_date": meeting_date,
        "lines_removed": total_lines_removed
    }
|
||
|
||
|
||
def main():
    """Command-line entry point: check arguments, merge, and print a report.

    Expects exactly two command-line arguments (input folder, output file).
    Exits with status 1 when the argument count is wrong, when the input path
    is not a directory, or when ``merge_transcripts`` reports failure.
    """
    cli_args = sys.argv[1:]
    if len(cli_args) != 2:
        print("用法: python merge_transcripts.py <输入文件夹路径> <输出文件路径>")
        print('示例: python merge_transcripts.py "C:\\转写文件" "output\\merged.md"')
        sys.exit(1)

    input_folder, output_path = cli_args

    if not os.path.isdir(input_folder):
        print(f"错误: 输入路径不是文件夹: {input_folder}")
        sys.exit(1)

    outcome = merge_transcripts(input_folder, output_path)

    # Failure path first, so the success report reads straight through.
    if not outcome["success"]:
        print(f"合并失败: {outcome['error']}")
        sys.exit(1)

    print("合并成功!")
    print(f" - 处理文件数: {outcome['file_count']}")
    print(f" - 会议日期: {outcome['meeting_date']}")
    print(f" - 过滤无用画面: {outcome['lines_removed']} 行")
    print(f" - 输出路径: {outcome['output_path']}")
    print(" - 文件列表:")
    for merged_name in outcome['files']:
        print(f" {merged_name}")
|
||
|
||
|
||
# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()
|