188 lines
5.3 KiB
Python
188 lines
5.3 KiB
Python
|
|
#!/usr/bin/env python3
|
|||
|
|
# -*- coding: utf-8 -*-
|
|||
|
|
"""
|
|||
|
|
转写文件合并脚本
|
|||
|
|
|
|||
|
|
将指定文件夹中的多个转写txt文件按文件名顺序合并为一个文件,
|
|||
|
|
在文件边界处添加合并标记,提示前后发言人标识可能不对应。
|
|||
|
|
|
|||
|
|
用法:
|
|||
|
|
python merge_transcripts.py <输入文件夹路径> <输出文件路径>
|
|||
|
|
|
|||
|
|
示例:
|
|||
|
|
python merge_transcripts.py "C:\转写文件" "output\merged_transcript.md"
|
|||
|
|
"""
|
|||
|
|
|
|||
|
|
import sys
|
|||
|
|
import os
|
|||
|
|
from pathlib import Path
|
|||
|
|
from datetime import datetime
|
|||
|
|
import re
|
|||
|
|
|
|||
|
|
|
|||
|
|
def extract_date_from_filename(filename: str) -> "str | None":
    """Extract a calendar date from a file name.

    Scans *filename* for runs of eight consecutive digits (for example
    ``20251202``) and returns the first run that is a real calendar date.
    Runs that are not valid dates (such as ``99999999`` or ``20250230``)
    are skipped instead of being reported as a date.

    Args:
        filename: File name (or any string) to search.

    Returns:
        The date formatted as ``YYYY-MM-DD``, or ``None`` when no
        eight-digit run forms a valid date.
    """
    for match in re.finditer(r'\d{8}', filename):
        digits = match.group()
        year, month, day = digits[:4], digits[4:6], digits[6:8]
        try:
            # Building a datetime rejects impossible year/month/day values.
            datetime(int(year), int(month), int(day))
        except ValueError:
            continue
        return f"{year}-{month}-{day}"
    return None
|
|||
|
|
|
|||
|
|
|
|||
|
|
# Regular-expression patterns describing screen-content lines that carry no
# useful information. Every pattern is anchored to a whole line that starts
# with the "画面内容:" prefix.
USELESS_SCREEN_PATTERNS = [
    # Black screen / screen turning black
    r'^画面内容:.*(?:画面变黑|黑屏|屏幕变黑|黑色背景).*$',
    # Bare mouse movement with no other substance
    r'^画面内容:鼠标(?:光标)?(?:在.*)?移动[。.]?$',
    # A window that is still loading
    r'^画面内容:.*正在加载.*$',
    # Meaningless frames such as plain white light spots
    r'^画面内容:.*(?:白色光点|光点).*$',
    # Display of a non-attendee account (张媛媛 is the meeting-room account,
    # not a participant)
    r'^画面内容:.*(?:张媛媛).*$',
]
|
|||
|
|
|
|||
|
|
|
|||
|
|
def is_useless_screen_content(line: str) -> bool:
    """Tell whether *line* is a screen-content line with no useful information.

    Surrounding whitespace is stripped from the line, which is then matched
    from its start against each entry of ``USELESS_SCREEN_PATTERNS``.

    Args:
        line: A single line of transcript text.

    Returns:
        True if any pattern matches, otherwise False.
    """
    candidate = line.strip()
    return any(re.match(regex, candidate) for regex in USELESS_SCREEN_PATTERNS)
|
|||
|
|
|
|||
|
|
|
|||
|
|
def _strip_noise_lines(text: str) -> "tuple[str, int]":
    """Drop blank lines and useless screen-content lines from *text*.

    Returns the filtered text and the number of screen-content lines removed
    (blank lines are dropped but not counted).
    """
    kept = []
    removed = 0
    for line in text.split('\n'):
        if not line.strip():
            continue
        if is_useless_screen_content(line):
            removed += 1
            continue
        kept.append(line)
    return '\n'.join(kept), removed


def merge_transcripts(input_folder: str, output_path: str) -> dict:
    """Merge the transcript ``*.txt`` files of a folder into one output file.

    Files are taken in file-name order. Blank lines and lines recognised by
    ``is_useless_screen_content`` are dropped. A header with the meeting date
    is written first, and an HTML-comment boundary marker naming the source
    file is inserted before every file after the first.

    The meeting date is taken from the first file name that yields one via
    ``extract_date_from_filename``; if none does, today's date is used.

    Args:
        input_folder: Folder containing the transcript txt files.
        output_path: Path of the merged file to write.

    Returns:
        dict: On success ``success`` (True), ``output_path``, ``file_count``,
        ``files``, ``meeting_date`` and ``lines_removed``. When no txt file is
        found: ``success`` (False), ``error`` and ``file_count`` (0).
    """
    input_path = Path(input_folder)
    output_file = Path(output_path)
    resolved_output = output_file.resolve()

    # Keep regular files only, and never feed a previous merge result back in
    # when the output file itself lives in the input folder as a .txt file.
    txt_files = sorted(
        (
            candidate
            for candidate in input_path.glob("*.txt")
            if candidate.is_file() and candidate.resolve() != resolved_output
        ),
        key=lambda p: p.name,
    )

    if not txt_files:
        return {
            "success": False,
            "error": f"未找到txt文件: {input_folder}",
            "file_count": 0
        }

    # Create the output directory only once there is something to write, so a
    # failed run leaves no empty directories behind.
    output_file.parent.mkdir(parents=True, exist_ok=True)

    # First file name that yields a date wins; otherwise fall back to today.
    candidate_dates = (extract_date_from_filename(p.name) for p in txt_files)
    meeting_date = next((d for d in candidate_dates if d), None)
    if not meeting_date:
        meeting_date = datetime.now().strftime("%Y-%m-%d")

    # Boundary marker inserted before every file except the first.
    merge_marker = (
        "\n"
        "\n"
        "<!-- ===== 文件合并边界 ===== -->\n"
        "<!-- 注意:以下内容来自新文件,发言者编号可能与上文不对应 -->\n"
        "<!-- 来源文件:{filename} -->\n"
        "\n"
    )

    header = (
        f"**会议日期**:{meeting_date}\n"
        "**参会人员**:待识别\n"
        "\n"
        "---\n"
        "\n"
        f"<!-- 来源文件:{txt_files[0].name} -->\n"
        "\n"
    )

    merged_content = [header]
    total_lines_removed = 0

    for index, txt_file in enumerate(txt_files):
        # utf-8-sig transparently drops a leading BOM; with plain utf-8 the
        # BOM would stay on the first line and stop the "^画面内容" patterns
        # from matching it.
        raw_text = txt_file.read_text(encoding='utf-8-sig').strip()

        content, removed_count = _strip_noise_lines(raw_text)
        total_lines_removed += removed_count

        if index > 0:
            merged_content.append(merge_marker.format(filename=txt_file.name))

        merged_content.append(content)

    with open(output_file, 'w', encoding='utf-8') as f:
        f.write('\n'.join(merged_content))

    return {
        "success": True,
        "output_path": str(output_file.absolute()),
        "file_count": len(txt_files),
        "files": [f.name for f in txt_files],
        "meeting_date": meeting_date,
        "lines_removed": total_lines_removed
    }
|
|||
|
|
|
|||
|
|
|
|||
|
|
def main():
    """Command-line entry point: check arguments, run the merge, report.

    Expects exactly two positional arguments (input folder, output file).
    Prints a usage message and exits with status 1 when the argument count is
    wrong, exits with status 1 when the input path is not a directory or the
    merge reports failure, and otherwise prints a summary of the merge.
    """
    cli_args = sys.argv[1:]
    if len(cli_args) != 2:
        print("用法: python merge_transcripts.py <输入文件夹路径> <输出文件路径>")
        print('示例: python merge_transcripts.py "C:\\转写文件" "output\\merged.md"')
        sys.exit(1)

    input_folder, output_path = cli_args

    if not os.path.isdir(input_folder):
        print(f"错误: 输入路径不是文件夹: {input_folder}")
        sys.exit(1)

    result = merge_transcripts(input_folder, output_path)

    # Failure path first, so the success report below stays unnested.
    if not result["success"]:
        print(f"合并失败: {result['error']}")
        sys.exit(1)

    print("合并成功!")
    print(f" - 处理文件数: {result['file_count']}")
    print(f" - 会议日期: {result['meeting_date']}")
    print(f" - 过滤无用画面: {result['lines_removed']} 行")
    print(f" - 输出路径: {result['output_path']}")
    print(" - 文件列表:")
    for merged_name in result['files']:
        print(f" {merged_name}")
|
|||
|
|
|
|||
|
|
|
|||
|
|
# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()
|