Initial commit: skills library

- 70 skills with code and documentation
- Add .gitignore (ignore __pycache__, output/, temp/, venv/)
- Clean up test intermediates and caches
This commit is contained in:
hmo
2026-04-26 19:27:40 +08:00
commit 04db423416
861 changed files with 210414 additions and 0 deletions
@@ -0,0 +1,321 @@
"""
视频内容分析框架
用于分析心理学/恋爱技巧类视频内容
"""
import json
from pathlib import Path
class VideoContentAnalyzer:
    """Keyword-based analyzer for transcripts of psychology / dating-advice videos.

    Every extraction step is purely lexical: the transcript is cut into
    sentences and each sentence is tested for fixed Chinese indicator
    keywords. Results accumulate in ``self.analysis``, a JSON-serializable
    dict.
    """

    # Characters (besides the Chinese full stop) treated as sentence breaks.
    _SENTENCE_BREAKS = ("!", "?", "\n")

    def __init__(self, video_title):
        """Create an analyzer for *video_title* with an empty result dict."""
        self.video_title = video_title
        self.analysis = {
            "title": video_title,
            "category": self._determine_category(),
            "key_concepts": [],
            "core_principles": [],
            "practical_techniques": [],
            "psychological_insights": [],
            "controversial_points": [],
            "ethical_considerations": [],
            "key_quotes": [],
            "summary": "",
        }

    def _determine_category(self):
        """Return the categories whose keywords occur in the title.

        Returns ``["unknown"]`` when no keyword matches.
        """
        title_lower = self.video_title.lower()
        # NOTE(review): the "relationship_psychology" and "controversial"
        # lists used to start with an empty-string entry (probably a keyword
        # lost in transit). "" is a substring of every string, so it made
        # both categories match every title; it is omitted here.
        categories = {
            "relationship_psychology": ["恋爱", "女人", "男人", "感情", "关系"],
            "self_improvement": ["方法", "技巧", "提升", "改变"],
            "controversial": ["强行", "套路", "操控"],
            "educational": ["心理学", "心理", "科学", "研究"],
        }
        detected = [
            cat
            for cat, keywords in categories.items()
            if any(kw in title_lower for kw in keywords)
        ]
        return detected or ["unknown"]

    @classmethod
    def _split_sentences(cls, text):
        """Split *text* into stripped, non-empty sentences.

        ``str.split("")`` raises ``ValueError`` (empty separator), so the
        transcript is split on the Chinese full stop after normalizing the
        other sentence breaks to it.
        """
        for mark in cls._SENTENCE_BREAKS:
            text = text.replace(mark, "。")
        return [part.strip() for part in text.split("。") if part.strip()]

    @classmethod
    def _sentences_containing(cls, text, indicators, limit=None, max_len=None):
        """Return sentences of *text* containing any of *indicators*.

        Args:
            text: Transcript text.
            indicators: Keywords; a sentence matches if it contains any one.
            limit: Keep at most this many matches (``None`` keeps all).
            max_len: Only accept sentences shorter than this many characters.
        """
        matches = [
            sentence
            for sentence in cls._split_sentences(text)
            if (max_len is None or len(sentence) < max_len)
            and any(word in sentence for word in indicators)
        ]
        return matches if limit is None else matches[:limit]

    def analyze_transcript(self, transcript_text):
        """Run every extraction step on *transcript_text*.

        Returns:
            The populated ``self.analysis`` dict.
        """
        print(f"分析视频: {self.video_title}")
        print(f"类别: {', '.join(self.analysis['category'])}")
        print(f"转录长度: {len(transcript_text)} 字符")
        self._extract_key_concepts(transcript_text)
        self._extract_core_principles(transcript_text)
        self._extract_practical_techniques(transcript_text)
        self._extract_psychological_insights(transcript_text)
        self._identify_controversial_points(transcript_text)
        self._analyze_ethical_considerations(transcript_text)
        self._extract_key_quotes(transcript_text)
        # The summary reads the lists filled in above, so it must run last.
        self._generate_summary(transcript_text)
        return self.analysis

    def _extract_key_concepts(self, text):
        """Record which of the known concept keywords appear anywhere in *text*."""
        # Plain substring lookup; a real NLP pipeline could replace this.
        concept_keywords = [
            "吸引力",
            "价值",
            "框架",
            "需求感",
            "投资",
            "服从性",
            "筛选",
            "推拉",
            "冷读",
            "心锚",
        ]
        self.analysis["key_concepts"] = [
            concept for concept in concept_keywords if concept in text
        ]

    def _extract_core_principles(self, text):
        """Store up to 5 sentences that contain a principle-style phrase."""
        principle_indicators = [
            "原则是",
            "核心是",
            "关键在于",
            "最重要的是",
            "本质是",
            "根本在于",
        ]
        self.analysis["core_principles"] = self._sentences_containing(
            text, principle_indicators, limit=5
        )

    def _extract_practical_techniques(self, text):
        """Store up to 10 short sentences (< 100 chars) mentioning a technique."""
        technique_indicators = ["方法", "技巧", "步骤", "操作", "做法", "策略", "战术"]
        self.analysis["practical_techniques"] = self._sentences_containing(
            text, technique_indicators, limit=10, max_len=100
        )

    def _extract_psychological_insights(self, text):
        """Store up to 8 sentences that mention a psychology keyword."""
        insight_indicators = [
            "心理学",
            "心理",
            "潜意识",
            "认知",
            "情绪",
            "动机",
            "需求",
            "人性",
        ]
        self.analysis["psychological_insights"] = self._sentences_containing(
            text, insight_indicators, limit=8
        )

    def _identify_controversial_points(self, text):
        """Store every sentence that contains a controversy keyword."""
        # An empty-string indicator (matches everything) was dropped here.
        controversial_indicators = [
            "强行",
            "操控",
            "套路",
            "欺骗",
            "利用",
            "不道德",
            "争议",
        ]
        self.analysis["controversial_points"] = self._sentences_containing(
            text, controversial_indicators
        )

    def _analyze_ethical_considerations(self, text):
        """Store every sentence that contains an ethics keyword."""
        ethical_indicators = [
            "尊重",
            "真诚",
            "诚实",
            "道德",
            "伦理",
            "责任",
            "伤害",
            "欺骗",
        ]
        self.analysis["ethical_considerations"] = self._sentences_containing(
            text, ethical_indicators
        )

    def _extract_key_quotes(self, text):
        """Store up to 5 medium-length sentences (21-149 chars) with a key word."""
        # An empty-string entry (matches everything) was dropped here.
        important_words = [
            "感情",
            "关系",
            "心理",
            "方法",
            "技巧",
            "价值",
            "吸引",
        ]
        quotes = [
            sentence
            for sentence in self._split_sentences(text)
            if 20 < len(sentence) < 150
            and any(word in sentence for word in important_words)
        ]
        self.analysis["key_quotes"] = quotes[:5]

    def _generate_summary(self, text):
        """Build a one-line summary from the lists already in ``self.analysis``.

        *text* is unused; the parameter is kept for interface compatibility.
        """
        # Template-based summary; an LLM would do better in a real application.
        summary = f"视频《{self.video_title}》主要探讨了"
        if self.analysis["key_concepts"]:
            summary += f"关于{', '.join(self.analysis['key_concepts'][:3])}等概念"
        if self.analysis["practical_techniques"]:
            summary += f",提出了{len(self.analysis['practical_techniques'])}个实用技巧"
        if self.analysis["controversial_points"]:
            summary += ",其中包含一些具有争议性的观点"
        if self.analysis["ethical_considerations"]:
            summary += ",同时也涉及伦理考量"
        self.analysis["summary"] = summary

    def save_analysis(self, output_path):
        """Write ``self.analysis`` to *output_path* as UTF-8, indented JSON."""
        with open(output_path, "w", encoding="utf-8") as f:
            json.dump(self.analysis, f, ensure_ascii=False, indent=2)
        print(f"分析结果已保存到: {output_path}")

    def print_analysis(self):
        """Print a human-readable report of ``self.analysis`` to stdout."""
        report = self.analysis
        print("\n" + "=" * 60)
        print("视频内容分析报告")
        print("=" * 60)
        print(f"\n📺 视频标题: {report['title']}")
        print(f"📂 类别: {', '.join(report['category'])}")
        print(f"📝 总结: {report['summary']}")
        print(f"\n🔑 关键概念 ({len(report['key_concepts'])}个):")
        for concept in report["key_concepts"]:
            print(f"{concept}")
        print(f"\n🎯 核心原则 ({len(report['core_principles'])}个):")
        for i, principle in enumerate(report["core_principles"], 1):
            print(f" {i}. {principle}")
        print(f"\n🛠️ 实用技巧 ({len(report['practical_techniques'])}个):")
        # Only the first five techniques are listed; the rest are counted.
        for i, technique in enumerate(report["practical_techniques"][:5], 1):
            print(f" {i}. {technique}")
        if len(report["practical_techniques"]) > 5:
            print(f" ... 还有{len(report['practical_techniques']) - 5}个技巧")
        print(f"\n🧠 心理学洞察 ({len(report['psychological_insights'])}个):")
        for i, insight in enumerate(report["psychological_insights"][:3], 1):
            print(f" {i}. {insight}")
        if report["controversial_points"]:
            print(f"\n⚠️ 争议点 ({len(report['controversial_points'])}个):")
            for i, point in enumerate(report["controversial_points"], 1):
                print(f" {i}. {point}")
        if report["ethical_considerations"]:
            print(f"\n⚖️ 伦理考量 ({len(report['ethical_considerations'])}个):")
            for i, consideration in enumerate(report["ethical_considerations"], 1):
                print(f" {i}. {consideration}")
        if report["key_quotes"]:
            print(f"\n💬 关键引述 ({len(report['key_quotes'])}个):")
            for i, quote in enumerate(report["key_quotes"], 1):
                print(f' {i}. "{quote}"')
# Usage example
if __name__ == "__main__":
    video_title = "一个很''的方法,让你喜欢的女人强行爱上你!"
    analyzer = VideoContentAnalyzer(video_title)
    # Intended follow-up once a transcript is available (not wired up yet):
    #   1. load the transcript text,
    #   2. pass it to analyzer.analyze_transcript(...),
    #   3. persist with analyzer.save_analysis("video_analysis.json"),
    #   4. show the report with analyzer.print_analysis().
    print("分析框架已创建,等待转录文本...")
+155
View File
@@ -0,0 +1,155 @@
"""
快速转录方案:如果本地Whisper太慢,尝试其他方法
"""
import os
import subprocess
from pathlib import Path
import json
def extract_audio_from_video(video_path):
    """Extract the audio track of a video into a WAV file using ffmpeg.

    The output is written next to the video with a ``.wav`` suffix, using
    the ``pcm_s16le`` codec at 16000 Hz with a single channel.

    Args:
        video_path: Location of the source video (``pathlib.Path`` or ``str``).

    Returns:
        The ``Path`` of the WAV file, or ``None`` when ffmpeg cannot be
        started or exits with a non-zero status.
    """
    video_path = Path(video_path)
    audio_path = video_path.with_suffix(".wav")
    print(f"提取音频: {video_path.name} -> {audio_path.name}")
    cmd = [
        "ffmpeg",
        "-y",  # overwrite an existing output file without prompting
        "-i",
        str(video_path),
        "-vn",  # drop the video stream
        "-acodec",
        "pcm_s16le",
        "-ar",
        "16000",
        "-ac",
        "1",
        str(audio_path),
    ]
    try:
        # errors="replace" keeps undecodable bytes in ffmpeg's output from
        # raising while the captured streams are decoded.
        result = subprocess.run(
            cmd, capture_output=True, text=True, errors="replace"
        )
    except OSError as exc:
        # Raised when the ffmpeg executable is missing or cannot be launched.
        print(f"音频提取失败: {exc}")
        return None
    if result.returncode != 0:
        print(f"音频提取失败: {result.stderr}")
        return None
    print(f"✅ 音频提取完成: {audio_path.stat().st_size / 1024 / 1024:.2f} MB")
    return audio_path
def check_whisper_availability():
    """Report whether the ``whisper`` package can be imported.

    Prints a status line and returns ``True`` on success, ``False`` when the
    import raises ``ImportError``.
    """
    try:
        import whisper  # noqa: F401  (imported only to probe availability)
    except ImportError:
        print("❌ Whisper未安装")
        return False
    else:
        print("✅ Whisper已安装")
        return True
def transcribe_with_whisper(audio_path, model_name="tiny", language="zh"):
    """Transcribe an audio file with a local Whisper model.

    Args:
        audio_path: Location of the audio file to transcribe.
        model_name: Whisper model to load. Defaults to ``"tiny"``, the
            smallest model, chosen for speed.
        language: Language code passed to ``model.transcribe``.

    Returns:
        Whatever ``model.transcribe`` returns, or ``None`` if importing
        Whisper, loading the model, or transcribing raises.
    """
    try:
        import whisper

        print("加载Whisper模型...")
        model = whisper.load_model(model_name)
        print("开始转录...")
        return model.transcribe(str(audio_path), language=language)
    except Exception as e:
        # Deliberately broad: this is the boundary to a third-party library
        # and callers only distinguish "result" from "no result".
        print(f"Whisper转录失败: {e}")
        return None
def save_transcription(result, video_path):
    """Persist a transcription next to the video and print a short preview.

    Two files are written, both named after the video: ``<video>.txt`` with
    ``result["text"]`` and ``<video>.json`` with the full *result*.

    Args:
        result: Transcription mapping with at least a ``"text"`` key.
        video_path: Location of the video (``pathlib.Path`` or ``str``).

    Returns:
        ``True`` when both files were written; ``False`` when *result* is
        empty/``None`` or a file could not be written.
    """
    if not result:
        return False
    video_path = Path(video_path)
    text = result["text"]
    txt_path = video_path.with_suffix(".txt")
    json_path = video_path.with_suffix(".json")
    try:
        # Plain text
        with open(txt_path, "w", encoding="utf-8") as f:
            f.write(text)
        print(f"✅ 转录文本保存到: {txt_path.name}")
        print(f"文本长度: {len(text)} 字符")
        # Full result (JSON)
        with open(json_path, "w", encoding="utf-8") as f:
            json.dump(result, f, ensure_ascii=False, indent=2)
        print(f"✅ 完整结果保存到: {json_path.name}")
    except OSError as exc:
        # Honor the boolean contract instead of crashing the caller.
        print(f"❌ 保存转录结果失败: {exc}")
        return False
    # Preview
    print("\n=== 转录预览(前500字符)===")
    print(text[:500] + "..." if len(text) > 500 else text)
    return True
def main(video_dir=None):
    """Transcribe the first ``*.mp4`` file found in a directory.

    Steps: pick the video, reuse an existing ``.txt`` transcript if present,
    otherwise check Whisper, extract audio, transcribe, save, and remove the
    temporary audio file.

    Args:
        video_dir: Directory to search. Defaults to the workspace temp
            directory this script was written for.
    """
    print("=== 视频转录程序 ===")
    if video_dir is None:
        video_dir = Path(r"D:\F\NewI\opencode\daily-workspace\temp")
    # Sorted so that the "first" video is deterministic across runs.
    video_files = sorted(Path(video_dir).glob("*.mp4"))
    if not video_files:
        print("❌ 未找到mp4文件")
        return
    video_path = video_files[0]
    print(f"处理视频: {video_path.name}")
    print(f"文件大小: {video_path.stat().st_size / 1024 / 1024:.2f} MB")

    # Reuse an existing transcript instead of transcribing again.
    txt_path = video_path.with_suffix(".txt")
    if txt_path.exists():
        print(f"✅ 已有转录文件: {txt_path.name}")
        with open(txt_path, "r", encoding="utf-8") as f:
            text = f.read()
        print(f"文本长度: {len(text)} 字符")
        print("\n=== 现有转录预览 ===")
        print(text[:500] + "..." if len(text) > 500 else text)
        return

    # Check Whisper before extracting audio so that a missing install does
    # not leave a temporary WAV file behind.
    if not check_whisper_availability():
        print("请安装Whisper: pip install openai-whisper")
        return

    audio_path = extract_audio_from_video(video_path)
    if not audio_path:
        print("❌ 无法提取音频")
        return

    try:
        result = transcribe_with_whisper(audio_path)
        if not result:
            print("❌ 转录失败")
            return
        if save_transcription(result, video_path):
            print("\n✅ 转录完成!")
        else:
            print("❌ 保存转录结果失败")
    finally:
        # Always remove the temporary audio, including on failure paths.
        if audio_path.exists():
            audio_path.unlink()
            print("临时音频文件已删除")


if __name__ == "__main__":
    main()
@@ -0,0 +1,43 @@
import whisper
import os
from pathlib import Path
# Directory that holds the video to transcribe.
video_dir = Path(r"D:\F\NewI\opencode\daily-workspace\temp")
# Sorted so that the "first" video is deterministic across runs.
video_files = sorted(video_dir.glob("*.mp4"))
if not video_files:
    print("未找到mp4文件")
    # exit() is injected by the site module and may be absent; raising
    # SystemExit is the portable equivalent.
    raise SystemExit(1)
video_file = video_files[0]
print(f"找到视频文件: {video_file.name}")
print(f"文件大小: {video_file.stat().st_size / 1024 / 1024:.2f} MB")

# Transcribe with Whisper.
print("\n加载Whisper模型...")
model = whisper.load_model("base")  # "base" model, chosen for speed
print("开始转录...")
result = model.transcribe(str(video_file), language="zh")
text = result["text"]

# Save the plain-text transcript next to the video.
output_file = video_file.with_suffix(".txt")
with open(output_file, "w", encoding="utf-8") as f:
    f.write(text)
print(f"\n✅ 转录完成!保存到: {output_file.name}")
print(f"转录文本长度: {len(text)} 字符")

# Preview of the first 500 characters; the ellipsis is added only when the
# text was actually truncated.
print("\n=== 转录预览(前500字符)===")
print(text[:500] + "..." if len(text) > 500 else text)

# When segment information is present, also save the full result as JSON.
if "segments" in result:
    import json

    json_file = video_file.with_suffix(".json")
    with open(json_file, "w", encoding="utf-8") as f:
        json.dump(result, f, ensure_ascii=False, indent=2)
    print(f"详细分段信息保存到: {json_file.name}")
+108
View File
@@ -0,0 +1,108 @@
import os
import subprocess
import json
import re
from pathlib import Path
# Input video (hard-coded path).
video_path = Path(
    r'D:\F\NewI\opencode\daily-workspace\temp\一个很""的方法,让你喜欢的女人强行爱上你!.mp4'
)
if not video_path.is_file():
    # Fail with a clear message instead of a FileNotFoundError from stat().
    print(f"❌ 视频文件不存在: {video_path}")
    raise SystemExit(1)
print(f"处理视频: {video_path.name}")
print(f"文件大小: {video_path.stat().st_size / 1024 / 1024:.2f} MB")

# 1. Extract the audio track to a WAV file next to the video.
audio_path = video_path.with_suffix(".wav")
print(f"\n1. 提取音频到: {audio_path.name}")
ffmpeg_cmd = [
    "ffmpeg",
    "-y",  # overwrite an existing output file without prompting
    "-i",
    str(video_path),
    "-vn",  # drop the video stream
    "-acodec",
    "pcm_s16le",
    "-ar",
    "16000",
    "-ac",
    "1",
    str(audio_path),
]
print(f"运行命令: {' '.join(ffmpeg_cmd[:4])}...")
try:
    ffmpeg_result = subprocess.run(
        ffmpeg_cmd, capture_output=True, text=True, errors="replace"
    )
except OSError as exc:
    # Raised when the ffmpeg executable is missing or cannot be launched.
    print(f"提取音频失败: {exc}")
    raise SystemExit(1)
if ffmpeg_result.returncode != 0:
    print(f"提取音频失败: {ffmpeg_result.stderr}")
    raise SystemExit(1)
print("✅ 音频提取完成")

# 2. Make sure FunASR is importable, installing it on demand.
print("\n2. 检查FunASR安装...")
try:
    import funasr  # noqa: F401  (imported only to probe availability)

    print("✅ FunASR已安装")
except ImportError:
    import importlib
    import sys

    print("❌ FunASR未安装,正在安装...")
    # Use the running interpreter's pip so the packages land in the same
    # environment this script imports from.
    install = subprocess.run(
        [sys.executable, "-m", "pip", "install", "funasr", "modelscope"],
        capture_output=True,
        text=True,
        errors="replace",
    )
    if install.returncode == 0:
        # Let the import system notice the freshly installed packages.
        importlib.invalidate_caches()
        print("✅ FunASR安装完成")
    else:
        print(f"❌ FunASR安装失败: {install.stderr}")

# 3. Transcribe the audio.
print("\n3. 开始转录...")
try:
    from funasr import AutoModel

    print("加载Paraformer模型...")
    model = AutoModel(
        model="paraformer-zh",
        vad_model="fsmn-vad",
        punc_model="ct-punc",
        disable_update=True,
    )
    print("转录中...")
    transcription = model.generate(
        input=str(audio_path), batch_size_s=300, timestamp_granularity="sentence"
    )

    # Save the raw model output as JSON next to the video.
    output_path = video_path.with_suffix(".json")
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(transcription, f, ensure_ascii=False, indent=2)
    print(f"✅ 转录完成,保存到: {output_path.name}")

    # Short summary of the output.
    print("\n转录摘要:")
    if isinstance(transcription, list) and len(transcription) > 0:
        full_text = "".join(
            item["text"] for item in transcription if "text" in item
        )
        print(f"总字符数: {len(full_text)}")
        print(f"句子数: {len(transcription)}")
        print("\n前3句:")
        for i, item in enumerate(transcription[:3], 1):
            if "text" in item:
                snippet = item["text"]
                # Ellipsis only when the sentence was actually truncated.
                suffix = "..." if len(snippet) > 100 else ""
                print(f" {i}. {snippet[:100]}{suffix}")
except Exception as e:
    # Broad on purpose: report any model/IO failure, then still clean up.
    print(f"❌ 转录失败: {e}")
    import traceback

    traceback.print_exc()

# 4. Remove the temporary audio file.
print("\n4. 清理临时文件...")
if audio_path.exists():
    audio_path.unlink()
    print("✅ 临时音频文件已删除")
print("\n✅ 处理完成!")