用于给llm喂提示词
HTML 去除标签上不必要的属性(仅保留 href 和 src)
用于减少token
from bs4 import BeautifulSoup
def remove_class_style(html_content):
    """Strip every attribute except ``href`` and ``src`` from all tags.

    The markup in *html_content* is parsed with BeautifulSoup's
    ``html.parser`` backend, each tag's attribute dict is replaced in place,
    and the document is returned serialized back to a string.
    """
    kept_names = ('href', 'src')
    document = BeautifulSoup(html_content, 'html.parser')
    # find_all(True) yields every tag in the parsed document.
    for element in document.find_all(True):
        # Rebuild the attribute dict from scratch so that anything not
        # explicitly kept (class, style, id, ...) is dropped.
        element.attrs = {
            name: element.attrs[name]
            for name in kept_names
            if name in element.attrs
        }
    return str(document)
# Sample HTML input used to demonstrate remove_class_style
html_content = """
<html>
<head>
<title>示例页面</title>
</head>
<body class="main-body" style="background-color: #f0f0f0;">
<div class="content" style="color: red;">
<p class="text" style="font-size: 14px;">这是一个示例段落。</p>
<a href="#" class="link" style="text-decoration: none;">这是一个链接。</a>
</div>
</body>
</html>
"""
# Run the sample through remove_class_style (drops class/style attributes) and print the result
clean_html = remove_class_style(html_content)
print(clean_html)
html转markdown
from markdownify import markdownify as md
# Minimal demo: convert an HTML string to Markdown with markdownify (imported as md) and print it
html = "<h1>Hello, World!</h1>"
markdown = md(html)
print(markdown)
提取项目路径-白名单版
import os
import re
# Regex rules for entries to skip; each rule is applied with re.search to the joined path
exclude_dirs = [r'\.vscode', r'\.git', 'runtime', 'vendor']
# Whitelist regex rules: only files whose path matches one of these are listed
include_files = [r'\.py$', r'\.txt$', r'\.md$'] # add more file types here as needed
def should_exclude(path, rules):
    """Return True when *path* matches at least one regex in *rules*.

    Every rule is tried with ``re.search``, so a match anywhere inside the
    string counts. An empty *rules* sequence yields False.
    """
    for pattern in rules:
        if re.search(pattern, path):
            return True
    return False
def should_include(path, rules):
    """Return True when *path* matches at least one whitelist regex in *rules*.

    Rules are tried in order with ``re.search``; evaluation stops at the
    first rule that matches.
    """
    matching_rules = (rule for rule in rules if re.search(rule, path))
    return next(matching_rules, None) is not None
def list_files_in_directory(directory, indent_level=0):
    """Print a filtered tree of *directory*, one entry per line.

    Each line is ``|`` followed by ``indent_level * 2 + 1`` dashes and the
    entry name. Entries matching ``exclude_dirs`` are skipped entirely,
    directories are printed and descended into, and files are printed only
    when they match ``include_files``. A directory that cannot be read
    because of missing permissions is reported as ``[Permission Denied]``.
    """
    prefix = '|' + '-' * (indent_level * 2 + 1)
    try:
        entries = os.listdir(directory)
    except PermissionError:
        print(prefix + '[Permission Denied]')
        return
    for name in sorted(entries):
        full_path = os.path.join(directory, name)
        if should_exclude(full_path, exclude_dirs):
            continue
        is_directory = os.path.isdir(full_path)
        # Non-directories must pass the whitelist to be shown at all.
        if not is_directory and not should_include(full_path, include_files):
            continue
        print(prefix + name)
        if is_directory:
            list_files_in_directory(full_path, indent_level + 1)
if __name__ == "__main__":
    # When run as a script, print the filtered tree rooted at the current directory
    list_files_in_directory("./")
提取项目路径-黑名单版
例子:
- res
  - adminui
    - dist
      - css
        - admin.css
        - login.css
      - modules
        - admin.js
        - index.js
        - view.js
    - src
      - css
        - admin.css
        - login.css
      - modules
        - admin.js
        - index.js
        - view.js
  - config.js
代码
import os
import re
# Regex rules for entries to skip entirely; applied with re.search to every joined path
exclude_dirs = [r'\.vscode', r'\.git']
# Regex rules for file suffixes to leave out of the listing
exclude_files = [r'\.png$', r'\.jpg$', r'\.jpeg$', r'\.gif$', r'\.bmp$', r'\.pyc$']
def should_exclude(path, rules):
    """Tell whether *path* is matched by any regex in *rules*.

    Matching uses ``re.search``, so a rule may hit anywhere in the string.
    Returns False when *rules* is empty.
    """
    search_results = map(lambda rule: re.search(rule, path), rules)
    return any(search_results)
def list_files_in_directory(directory, indent_level=0):
    """Print *directory* as a nested ``- name`` list, two spaces per level.

    Entries matching ``exclude_dirs`` are skipped. Directories are printed
    and then listed recursively one level deeper. Files are printed unless
    they match ``exclude_files``. A directory that cannot be read because of
    missing permissions is reported as ``- [Permission Denied]``.
    """
    bullet = ' ' * (indent_level * 2) + '- '
    try:
        entries = os.listdir(directory)
    except PermissionError:
        print(bullet + '[Permission Denied]')
        return
    for name in sorted(entries):
        full_path = os.path.join(directory, name)
        if should_exclude(full_path, exclude_dirs):
            continue
        if os.path.isdir(full_path):
            print(bullet + name)
            list_files_in_directory(full_path, indent_level + 1)
            continue
        if should_exclude(full_path, exclude_files):
            continue
        print(bullet + name)
if __name__ == "__main__":
    # Ask for the root directory; strip stray whitespace that often comes
    # along when a path is pasted into the prompt.
    directory = input("请输入要提取目录的根路径: ").strip()
    # The listing function only handles PermissionError, so an empty,
    # mistyped or non-directory path would otherwise end in a traceback.
    # Exit with a readable message (and a non-zero status) instead.
    if not os.path.isdir(directory):
        raise SystemExit(f"路径 {directory} 不存在或不是目录")
    # Print the root itself as the top-level bullet, then its contents.
    print(f'- {directory}')
    list_files_in_directory(directory)
提取文件
import os
def extract_files_content(directory, root_dir=None, whitelist_extensions=None,
                          output_path="1.txt"):
    """Append every whitelisted file under *directory* to one text file.

    For each matching file the output receives the file's path relative to
    *root_dir*, the file's text wrapped in a ``` fence, and a blank line.
    Sub-directories are visited recursively and entries are processed in
    sorted order, so the output is deterministic.

    Args:
        directory: Directory to scan.
        root_dir: Base used for the relative paths written to the output.
            Defaults to *directory*.
        whitelist_extensions: Iterable of filename suffixes to export, for
            example ``['.md']``. ``None`` or an empty iterable exports nothing.
        output_path: File that receives the export. It is opened in append
            mode, so the caller decides whether to truncate it beforehand.

    Returns:
        None. When *directory* does not exist a message is printed and the
        output file is not touched.
    """
    if not os.path.exists(directory):
        print(f"路径 {directory} 不存在")
        return
    if root_dir is None:
        root_dir = directory
    # str.endswith accepts a tuple, which replaces the per-suffix loop.
    suffixes = tuple(whitelist_extensions or ())
    # Open the output exactly once for the whole walk. Re-opening it in every
    # recursive call while the outer handle still holds buffered data lets
    # the blocks reach the file out of order or interleaved.
    with open(output_path, "a", encoding="utf-8") as out:
        _write_files_content(directory, root_dir, suffixes, out, output_path)


def _write_files_content(directory, root_dir, suffixes, out, output_path):
    """Recursively write the whitelisted files under *directory* to *out*."""
    for filename in sorted(os.listdir(directory)):
        filepath = os.path.join(directory, filename)
        if os.path.isdir(filepath):
            _write_files_content(filepath, root_dir, suffixes, out, output_path)
            continue
        if not os.path.isfile(filepath):
            continue
        # Progress output: every regular file encountered is echoed.
        print(filename)
        if not filename.endswith(suffixes):
            continue
        # Never feed the export file back into itself when it lives inside
        # the scanned tree and happens to match the whitelist.
        if os.path.samefile(filepath, output_path):
            continue
        relative_filepath = os.path.relpath(filepath, root_dir)
        with open(filepath, 'r', encoding='utf-8') as source:
            content = source.read()
        out.write(f"{relative_filepath}\n```\n{content}\n```\n\n")
# Export the names and contents of all whitelisted files under the given path
current_directory = "D:\\Code\\py\\docs" # change this to the directory you want to export
whitelist_extensions = ['.md'] # filename suffixes to export
# Empty the output file first, because extract_files_content opens it in append mode
with open("1.txt", "w", encoding="utf-8") as f:
    f.write("")
extract_files_content(current_directory, whitelist_extensions=whitelist_extensions)
提取目录下的文件,并把相对路径中的分隔符转为下划线作为新文件名,方便导入知识库
import os
import re
# Regex rules for directories that must not be descended into (applied with re.search)
exclude_dirs = [r'\.vscode', r'\.git']
# Regex rules for file suffixes to leave out
# NOTE(review): exclude_files is not referenced by the code visible in this snippet — confirm whether it is still needed
exclude_files = [r'\.png$', r'\.jpg$', r'\.jpeg$', r'\.gif$', r'\.bmp$', r'\.pyc$']
def should_exclude(path, rules):
    """Report whether any regex in *rules* matches somewhere in *path*.

    Uses ``re.search`` for each rule and stops at the first hit; an empty
    *rules* sequence gives False.
    """
    matched = False
    for rule in rules:
        if re.search(rule, path) is not None:
            matched = True
            break
    return matched
def extract_files_content(directory, root_dir=None, whitelist_extensions=None,
                          output_dir="tmp"):
    """Copy whitelisted files under *directory* into one flat folder.

    Each matching file is written to *output_dir* under a name built from its
    path relative to *root_dir* with the path separators replaced by
    underscores (``guide/intro/start.md`` becomes ``guide_intro_start.md``).
    Sub-directories are visited recursively unless they match
    ``exclude_dirs``.

    Args:
        directory: Directory to scan.
        root_dir: Base used to compute the relative path that becomes the
            flattened file name. Defaults to *directory*.
        whitelist_extensions: Iterable of filename suffixes to copy, for
            example ``['.md']``. ``None`` or an empty iterable copies nothing.
        output_dir: Folder that receives the flattened files. It is created
            on demand.

    Returns:
        None. When *directory* does not exist a message is printed and
        nothing is written.

    Note:
        Two different source paths can flatten to the same name (``a_b.md``
        and ``a/b.md``); the one processed later overwrites the earlier one.
    """
    if not os.path.exists(directory):
        print(f"路径 {directory} 不存在")
        return
    if root_dir is None:
        root_dir = directory
    # str.endswith accepts a tuple, which replaces the per-suffix loop.
    suffixes = tuple(whitelist_extensions or ())
    # Sorted so the processing order, and therefore which file wins on a
    # name collision, does not depend on the file system.
    for filename in sorted(os.listdir(directory)):
        filepath = os.path.join(directory, filename)
        if os.path.isfile(filepath):
            if not filename.endswith(suffixes):
                continue
            relative_filepath = os.path.relpath(filepath, root_dir)
            structured_filename = '_'.join(relative_filepath.split(os.sep))
            # Read before opening the destination so a failed read does not
            # leave an empty output file behind.
            with open(filepath, 'r', encoding='utf-8') as source:
                content = source.read()
            # open() does not create missing parent folders.
            os.makedirs(output_dir, exist_ok=True)
            target_path = os.path.join(output_dir, structured_filename)
            with open(target_path, "w", encoding="utf-8") as target:
                target.write(content)
        elif os.path.isdir(filepath) and not should_exclude(filepath, exclude_dirs):
            extract_files_content(filepath, root_dir, whitelist_extensions, output_dir)
# Flatten every whitelisted file under the given path
current_directory = "docs" # change this to the directory you want to export
whitelist_extensions = ['.md'] # filename suffixes to export
extract_files_content(current_directory, whitelist_extensions=whitelist_extensions)