60 lines
1.9 KiB
Python
60 lines
1.9 KiB
Python
# -*- coding: utf-8 -*-
"""
Speaker replacement script for meeting transcripts.

Usage: python replace_speaker.py <input_file> [output_file]
If no output file is given, the input file is overwritten.
"""
|
|
import sys
|
|
import re
|
|
import os
|
|
|
|
# Regex fragment for the "(HH:MM:SS)" timestamp that may follow a speaker name.
_TIMESTAMP = r'\(\d{2}:\d{2}:\d{2}\)'

# Built-in speaker renames, used when the caller does not supply a mapping.
_DEFAULT_SPEAKER_MAP = {
    '郝倩玉': '线下人员',
    '信通院云大所市场部-张媛媛': '线下人员',
    '.': '江争达',
}


def replace_speakers(input_file, output_file=None, speaker_map=None):
    """Rename speakers in a transcript and strip speaker timestamps.

    A speaker label is recognised only at the start of a line, in the form
    ``name:`` or ``name(HH:MM:SS):``. Labels whose name is a key of
    *speaker_map* are rewritten to ``new_name:`` (any timestamp is dropped).
    Afterwards, every remaining ``name(HH:MM:SS):`` label at the start of a
    line has its timestamp removed while the name is kept.

    Args:
        input_file: Path of the UTF-8 transcript to read.
        output_file: Path to write the result to. When ``None`` the input
            file is overwritten in place.
        speaker_map: Optional mapping of original speaker name to
            replacement name. Names are matched literally (not as regular
            expressions). When ``None`` the built-in mapping is used:
            ``郝倩玉`` and ``信通院云大所市场部-张媛媛`` become ``线下人员``,
            and ``.`` becomes ``江争达``.

    Returns:
        The path that was written (``output_file``, or ``input_file`` when
        no output path was given).
    """
    if output_file is None:
        output_file = input_file
    if speaker_map is None:
        speaker_map = _DEFAULT_SPEAKER_MAP

    # The whole file is read before anything is written, so writing back to
    # the same path is safe.
    with open(input_file, 'r', encoding='utf-8') as f:
        content = f.read()

    if speaker_map:
        # One alternation over all (escaped) names, applied in a single pass
        # so that a replacement name is never itself renamed by another rule.
        names = '|'.join(
            re.escape(name)
            for name in sorted(speaker_map, key=len, reverse=True)
        )
        rename_pattern = re.compile(
            r'^(' + names + r')(?:' + _TIMESTAMP + r')?:',
            flags=re.MULTILINE,
        )
        # A callable replacement keeps backslashes in new names literal.
        content = rename_pattern.sub(
            lambda match: speaker_map[match.group(1)] + ':',
            content,
        )

    # Strip the timestamp from every other speaker label.
    content = re.sub(
        r'^([^\n\(]+)' + _TIMESTAMP + r':',
        r'\1:',
        content,
        flags=re.MULTILINE,
    )

    with open(output_file, 'w', encoding='utf-8') as f:
        f.write(content)

    print(f"替换完成: {output_file}")
    return output_file
|
|
|
|
def main():
    """Command-line entry point.

    Takes the input path from ``sys.argv[1]`` and an optional output path
    from ``sys.argv[2]``, then calls ``replace_speakers``. Prints a usage
    message and exits with status 1 when no input path is given, and prints
    an error and exits with status 1 when the input path is not an existing
    regular file.
    """
    if len(sys.argv) < 2:
        # Usage and errors go to stderr so stdout carries only normal output.
        print("用法: python replace_speaker.py <输入文件> [输出文件]", file=sys.stderr)
        print("示例: python replace_speaker.py transcript.txt", file=sys.stderr)
        print(" python replace_speaker.py transcript.txt output.txt", file=sys.stderr)
        sys.exit(1)

    input_file = sys.argv[1]
    output_file = sys.argv[2] if len(sys.argv) > 2 else None

    # isfile() rather than exists(): a directory path would otherwise pass
    # the check and crash later in open() with a traceback.
    if not os.path.isfile(input_file):
        print(f"错误: 文件不存在 - {input_file}", file=sys.stderr)
        sys.exit(1)

    replace_speakers(input_file, output_file)
|
|
|
|
# Run the command-line interface only when executed as a script,
# not when this module is imported.
if __name__ == "__main__":
    main()
|