用于给llm喂提示词

# html去除不必要tag 用于减少token

```python
from bs4 import BeautifulSoup


def remove_class_style(html_content, keep_attrs=("href", "src")):
    """Drop every tag attribute except the ones listed in *keep_attrs*.

    Args:
        html_content: HTML markup as a string.
        keep_attrs: Attribute names to preserve, in the order they should be
            re-emitted. Defaults to ``href`` and ``src``.

    Returns:
        The document serialized back to a string with all other attributes
        (class, style, id, ...) removed.
    """
    soup = BeautifulSoup(html_content, 'html.parser')

    # find_all(True) yields every tag in the document.
    for tag in soup.find_all(True):
        tag.attrs = {
            name: tag.attrs[name] for name in keep_attrs if name in tag.attrs
        }

    return str(soup)


# Sample HTML content
html_content = """
<html>
<head>
    <title>示例页面</title>
</head>
<body class="main-body" style="background-color: #f0f0f0;">
    <div class="content" style="color: red;">
        <p class="text" style="font-size: 14px;">这是一个示例段落。</p>
        <a href="#" class="link" style="text-decoration: none;">这是一个链接。</a>
    </div>
</body>
</html>
"""

# Strip the class/style attributes and show the result
clean_html = remove_class_style(html_content)
print(clean_html)
```

# html转markdown

```python
from markdownify import markdownify as md

html = "<h1>Hello, World!</h1>"
markdown = md(html)
print(markdown)
```

# 提取项目路径-白名单版

```python
import os
import re

# Entries to skip. The patterns are anchored and matched against the entry
# name only, so ".git" does not also hide ".gitignore" or ".github", and
# "runtime" does not hide "runtime_utils.py".
exclude_dirs = [r'^\.vscode$', r'^\.git$', r'^runtime$', r'^vendor$']

# Whitelist of file-name patterns; add more file types as needed.
include_files = [r'\.py$', r'\.txt$', r'\.md$']


def should_exclude(path, rules):
    """Return True if the last component of *path* matches any regex in *rules*."""
    name = os.path.basename(os.path.normpath(path))
    return any(re.search(rule, name) for rule in rules)


def should_include(path, rules):
    """Return True if the last component of *path* matches any regex in *rules*."""
    name = os.path.basename(os.path.normpath(path))
    return any(re.search(rule, name) for rule in rules)


def list_files_in_directory(directory, indent_level=0):
    """Print a tree of *directory*: all sub-directories plus whitelisted files.

    Args:
        directory: Directory to list.
        indent_level: Nesting depth, used to size the ``|---`` prefix.
    """
    prefix = '|' + '-' * (indent_level * 2 + 1)

    try:
        items = os.listdir(directory)
    except PermissionError:
        print(prefix + '[Permission Denied]')
        return

    for item in sorted(items):
        item_path = os.path.join(directory, item)

        if should_exclude(item_path, exclude_dirs):
            continue

        if os.path.isdir(item_path):
            print(prefix + item)
            list_files_in_directory(item_path, indent_level + 1)
        elif should_include(item_path, include_files):
            print(prefix + item)


if __name__ == "__main__":
    list_files_in_directory("./")
```

# 提取项目路径-黑名单版

例子:

```
- res
  - adminui
    - dist
      - css
        - admin.css
        - login.css
      - modules
        - admin.js
        - index.js
        - view.js
    - src
      - css
        - admin.css
        - login.css
      - modules
        - admin.js
        - index.js
        - view.js
  - config.js
```

代码

```python
import os
import re

# Entries to skip. Anchored and matched against the entry name only, so
# ".git" does not also hide ".gitignore" or ".github".
exclude_dirs = [r'^\.vscode$', r'^\.git$']
exclude_files = [r'\.png$', r'\.jpg$', r'\.jpeg$', r'\.gif$', r'\.bmp$', r'\.pyc$']


def should_exclude(path, rules):
    """Return True if the last component of *path* matches any regex in *rules*."""
    name = os.path.basename(os.path.normpath(path))
    return any(re.search(rule, name) for rule in rules)


def list_files_in_directory(directory, indent_level=0):
    """Print a Markdown-style bullet tree of *directory*, minus excluded entries.

    Args:
        directory: Directory to list.
        indent_level: Nesting depth; each level indents by two spaces.
    """
    indent = ' ' * indent_level * 2

    try:
        items = os.listdir(directory)
    except PermissionError:
        print(indent + '- [Permission Denied]')
        return

    for item in sorted(items):
        item_path = os.path.join(directory, item)

        if should_exclude(item_path, exclude_dirs):
            continue

        if os.path.isdir(item_path):
            print(indent + f'- {item}')
            list_files_in_directory(item_path, indent_level + 1)
        elif not should_exclude(item_path, exclude_files):
            print(indent + f'- {item}')


if __name__ == "__main__":
    directory = input("请输入要提取目录的根路径: ")
    print(f'- {directory}')
    # The root is printed at level 0, so its children start at level 1;
    # otherwise they would look like siblings of the root.
    list_files_in_directory(directory, indent_level=1)
```

# 提取文件

```python
import os


def extract_files_content(directory, root_dir=None, whitelist_extensions=None,
                          output_path="1.txt"):
    """Append the path and contents of whitelisted files to *output_path*.

    Walks *directory* recursively. For every file whose name ends with one of
    *whitelist_extensions*, writes its path relative to *root_dir* followed by
    its contents wrapped in a ``` fence.

    Args:
        directory: Directory to scan.
        root_dir: Base for the relative paths; defaults to *directory*.
        whitelist_extensions: Suffixes to extract. With the default (None)
            nothing is extracted.
        output_path: File to append to. Defaults to "1.txt".
    """
    if not os.path.exists(directory):
        print(f"路径 {directory} 不存在")
        return

    if root_dir is None:
        root_dir = directory

    extensions = tuple(whitelist_extensions or ())

    # Open the output exactly once. Re-opening it in append mode at every
    # recursion level (while the parent handle still holds buffered data)
    # lets entries reach the file out of order or interleaved.
    with open(output_path, "a", encoding="utf-8") as out:
        _write_directory(directory, root_dir, extensions, out,
                         os.path.abspath(output_path))


def _write_directory(directory, root_dir, extensions, out, output_abspath):
    """Recursively write whitelisted files under *directory* to the open *out*."""
    # Sorted so the output order does not depend on the file system.
    for filename in sorted(os.listdir(directory)):
        filepath = os.path.join(directory, filename)

        if os.path.isfile(filepath):
            print(filename)

            if not filename.endswith(extensions):
                continue
            # Never feed the output file back into itself.
            if os.path.abspath(filepath) == output_abspath:
                continue

            relative_filepath = os.path.relpath(filepath, root_dir)

            # Read first, so a failed read does not leave a dangling header.
            with open(filepath, 'r', encoding='utf-8') as file:
                content = file.read()

            out.write(relative_filepath + '\n')
            out.write("```\n")
            out.write(content + '\n')
            out.write("```\n\n")
        elif os.path.isdir(filepath):
            _write_directory(filepath, root_dir, extensions, out, output_abspath)


# Extract the names and contents of all files under the given path
current_directory = "D:\\Code\\py\\docs"  # change this to the real path
whitelist_extensions = ['.md']  # whitelisted suffixes

# Truncate the output first, because extract_files_content appends
with open("1.txt", "w", encoding="utf-8") as f:
    f.write("")

extract_files_content(current_directory, whitelist_extensions=whitelist_extensions)
```

提取目录文件转为下划线,方便导入知识库

```python
import os
import re

# Directories to skip. Anchored and matched against the entry name only, so
# ".git" does not also hide ".github".
exclude_dirs = [r'^\.vscode$', r'^\.git$']
# NOTE: defined for parity with the listing scripts; nothing below reads it.
exclude_files = [r'\.png$', r'\.jpg$', r'\.jpeg$', r'\.gif$', r'\.bmp$', r'\.pyc$']


def should_exclude(path, rules):
    """Return True if the last component of *path* matches any regex in *rules*."""
    name = os.path.basename(os.path.normpath(path))
    return any(re.search(rule, name) for rule in rules)


def extract_files_content(directory, root_dir=None, whitelist_extensions=None,
                          output_dir="tmp"):
    """Copy whitelisted files into *output_dir* under flattened names.

    Each file is written as ``<dir>_<subdir>_..._<filename>``: its path
    relative to *root_dir* with the separators replaced by underscores.

    Args:
        directory: Directory to scan recursively.
        root_dir: Base for the relative paths; defaults to *directory*.
        whitelist_extensions: Suffixes to copy. With the default (None)
            nothing is copied.
        output_dir: Destination directory, created if missing. Defaults to "tmp".
    """
    if not os.path.exists(directory):
        print(f"路径 {directory} 不存在")
        return

    if root_dir is None:
        root_dir = directory

    extensions = tuple(whitelist_extensions or ())

    # Without this the first write fails with FileNotFoundError on a fresh run.
    os.makedirs(output_dir, exist_ok=True)

    for filename in sorted(os.listdir(directory)):
        filepath = os.path.join(directory, filename)

        if os.path.isfile(filepath):
            if not filename.endswith(extensions):
                continue

            relative_filepath = os.path.relpath(filepath, root_dir)
            structured_filename = relative_filepath.replace(os.sep, '_')

            with open(filepath, 'r', encoding='utf-8') as file:
                content = file.read()

            target = os.path.join(output_dir, structured_filename)
            with open(target, "w", encoding="utf-8") as f:
                f.write(content)
        elif os.path.isdir(filepath) and not should_exclude(filepath, exclude_dirs):
            extract_files_content(filepath, root_dir, whitelist_extensions, output_dir)


# Extract the names and contents of all files under the given path
current_directory = "docs"  # change this to the real path
whitelist_extensions = ['.md']  # whitelisted suffixes

extract_files_content(current_directory, whitelist_extensions=whitelist_extensions)
```

Loading...
用于给llm喂提示词

# html去除不必要tag 用于减少token

```python
from bs4 import BeautifulSoup


def remove_class_style(html_content, keep_attrs=("href", "src")):
    """Drop every tag attribute except the ones listed in *keep_attrs*.

    Args:
        html_content: HTML markup as a string.
        keep_attrs: Attribute names to preserve, in the order they should be
            re-emitted. Defaults to ``href`` and ``src``.

    Returns:
        The document serialized back to a string with all other attributes
        (class, style, id, ...) removed.
    """
    soup = BeautifulSoup(html_content, 'html.parser')

    # find_all(True) yields every tag in the document.
    for tag in soup.find_all(True):
        tag.attrs = {
            name: tag.attrs[name] for name in keep_attrs if name in tag.attrs
        }

    return str(soup)


# Sample HTML content
html_content = """
<html>
<head>
    <title>示例页面</title>
</head>
<body class="main-body" style="background-color: #f0f0f0;">
    <div class="content" style="color: red;">
        <p class="text" style="font-size: 14px;">这是一个示例段落。</p>
        <a href="#" class="link" style="text-decoration: none;">这是一个链接。</a>
    </div>
</body>
</html>
"""

# Strip the class/style attributes and show the result
clean_html = remove_class_style(html_content)
print(clean_html)
```

# html转markdown

```python
from markdownify import markdownify as md

html = "<h1>Hello, World!</h1>"
markdown = md(html)
print(markdown)
```

# 提取项目路径-白名单版

```python
import os
import re

# Entries to skip. The patterns are anchored and matched against the entry
# name only, so ".git" does not also hide ".gitignore" or ".github", and
# "runtime" does not hide "runtime_utils.py".
exclude_dirs = [r'^\.vscode$', r'^\.git$', r'^runtime$', r'^vendor$']

# Whitelist of file-name patterns; add more file types as needed.
include_files = [r'\.py$', r'\.txt$', r'\.md$']


def should_exclude(path, rules):
    """Return True if the last component of *path* matches any regex in *rules*."""
    name = os.path.basename(os.path.normpath(path))
    return any(re.search(rule, name) for rule in rules)


def should_include(path, rules):
    """Return True if the last component of *path* matches any regex in *rules*."""
    name = os.path.basename(os.path.normpath(path))
    return any(re.search(rule, name) for rule in rules)


def list_files_in_directory(directory, indent_level=0):
    """Print a tree of *directory*: all sub-directories plus whitelisted files.

    Args:
        directory: Directory to list.
        indent_level: Nesting depth, used to size the ``|---`` prefix.
    """
    prefix = '|' + '-' * (indent_level * 2 + 1)

    try:
        items = os.listdir(directory)
    except PermissionError:
        print(prefix + '[Permission Denied]')
        return

    for item in sorted(items):
        item_path = os.path.join(directory, item)

        if should_exclude(item_path, exclude_dirs):
            continue

        if os.path.isdir(item_path):
            print(prefix + item)
            list_files_in_directory(item_path, indent_level + 1)
        elif should_include(item_path, include_files):
            print(prefix + item)


if __name__ == "__main__":
    list_files_in_directory("./")
```

# 提取项目路径-黑名单版

例子:

```
- res
  - adminui
    - dist
      - css
        - admin.css
        - login.css
      - modules
        - admin.js
        - index.js
        - view.js
    - src
      - css
        - admin.css
        - login.css
      - modules
        - admin.js
        - index.js
        - view.js
  - config.js
```

代码

```python
import os
import re

# Entries to skip. Anchored and matched against the entry name only, so
# ".git" does not also hide ".gitignore" or ".github".
exclude_dirs = [r'^\.vscode$', r'^\.git$']
exclude_files = [r'\.png$', r'\.jpg$', r'\.jpeg$', r'\.gif$', r'\.bmp$', r'\.pyc$']


def should_exclude(path, rules):
    """Return True if the last component of *path* matches any regex in *rules*."""
    name = os.path.basename(os.path.normpath(path))
    return any(re.search(rule, name) for rule in rules)


def list_files_in_directory(directory, indent_level=0):
    """Print a Markdown-style bullet tree of *directory*, minus excluded entries.

    Args:
        directory: Directory to list.
        indent_level: Nesting depth; each level indents by two spaces.
    """
    indent = ' ' * indent_level * 2

    try:
        items = os.listdir(directory)
    except PermissionError:
        print(indent + '- [Permission Denied]')
        return

    for item in sorted(items):
        item_path = os.path.join(directory, item)

        if should_exclude(item_path, exclude_dirs):
            continue

        if os.path.isdir(item_path):
            print(indent + f'- {item}')
            list_files_in_directory(item_path, indent_level + 1)
        elif not should_exclude(item_path, exclude_files):
            print(indent + f'- {item}')


if __name__ == "__main__":
    directory = input("请输入要提取目录的根路径: ")
    print(f'- {directory}')
    # The root is printed at level 0, so its children start at level 1;
    # otherwise they would look like siblings of the root.
    list_files_in_directory(directory, indent_level=1)
```

# 提取文件

```python
import os


def extract_files_content(directory, root_dir=None, whitelist_extensions=None,
                          output_path="1.txt"):
    """Append the path and contents of whitelisted files to *output_path*.

    Walks *directory* recursively. For every file whose name ends with one of
    *whitelist_extensions*, writes its path relative to *root_dir* followed by
    its contents wrapped in a ``` fence.

    Args:
        directory: Directory to scan.
        root_dir: Base for the relative paths; defaults to *directory*.
        whitelist_extensions: Suffixes to extract. With the default (None)
            nothing is extracted.
        output_path: File to append to. Defaults to "1.txt".
    """
    if not os.path.exists(directory):
        print(f"路径 {directory} 不存在")
        return

    if root_dir is None:
        root_dir = directory

    extensions = tuple(whitelist_extensions or ())

    # Open the output exactly once. Re-opening it in append mode at every
    # recursion level (while the parent handle still holds buffered data)
    # lets entries reach the file out of order or interleaved.
    with open(output_path, "a", encoding="utf-8") as out:
        _write_directory(directory, root_dir, extensions, out,
                         os.path.abspath(output_path))


def _write_directory(directory, root_dir, extensions, out, output_abspath):
    """Recursively write whitelisted files under *directory* to the open *out*."""
    # Sorted so the output order does not depend on the file system.
    for filename in sorted(os.listdir(directory)):
        filepath = os.path.join(directory, filename)

        if os.path.isfile(filepath):
            print(filename)

            if not filename.endswith(extensions):
                continue
            # Never feed the output file back into itself.
            if os.path.abspath(filepath) == output_abspath:
                continue

            relative_filepath = os.path.relpath(filepath, root_dir)

            # Read first, so a failed read does not leave a dangling header.
            with open(filepath, 'r', encoding='utf-8') as file:
                content = file.read()

            out.write(relative_filepath + '\n')
            out.write("```\n")
            out.write(content + '\n')
            out.write("```\n\n")
        elif os.path.isdir(filepath):
            _write_directory(filepath, root_dir, extensions, out, output_abspath)


# Extract the names and contents of all files under the given path
current_directory = "D:\\Code\\py\\docs"  # change this to the real path
whitelist_extensions = ['.md']  # whitelisted suffixes

# Truncate the output first, because extract_files_content appends
with open("1.txt", "w", encoding="utf-8") as f:
    f.write("")

extract_files_content(current_directory, whitelist_extensions=whitelist_extensions)
```

提取目录文件转为下划线,方便导入知识库

```python
import os
import re

# Directories to skip. Anchored and matched against the entry name only, so
# ".git" does not also hide ".github".
exclude_dirs = [r'^\.vscode$', r'^\.git$']
# NOTE: defined for parity with the listing scripts; nothing below reads it.
exclude_files = [r'\.png$', r'\.jpg$', r'\.jpeg$', r'\.gif$', r'\.bmp$', r'\.pyc$']


def should_exclude(path, rules):
    """Return True if the last component of *path* matches any regex in *rules*."""
    name = os.path.basename(os.path.normpath(path))
    return any(re.search(rule, name) for rule in rules)


def extract_files_content(directory, root_dir=None, whitelist_extensions=None,
                          output_dir="tmp"):
    """Copy whitelisted files into *output_dir* under flattened names.

    Each file is written as ``<dir>_<subdir>_..._<filename>``: its path
    relative to *root_dir* with the separators replaced by underscores.

    Args:
        directory: Directory to scan recursively.
        root_dir: Base for the relative paths; defaults to *directory*.
        whitelist_extensions: Suffixes to copy. With the default (None)
            nothing is copied.
        output_dir: Destination directory, created if missing. Defaults to "tmp".
    """
    if not os.path.exists(directory):
        print(f"路径 {directory} 不存在")
        return

    if root_dir is None:
        root_dir = directory

    extensions = tuple(whitelist_extensions or ())

    # Without this the first write fails with FileNotFoundError on a fresh run.
    os.makedirs(output_dir, exist_ok=True)

    for filename in sorted(os.listdir(directory)):
        filepath = os.path.join(directory, filename)

        if os.path.isfile(filepath):
            if not filename.endswith(extensions):
                continue

            relative_filepath = os.path.relpath(filepath, root_dir)
            structured_filename = relative_filepath.replace(os.sep, '_')

            with open(filepath, 'r', encoding='utf-8') as file:
                content = file.read()

            target = os.path.join(output_dir, structured_filename)
            with open(target, "w", encoding="utf-8") as f:
                f.write(content)
        elif os.path.isdir(filepath) and not should_exclude(filepath, exclude_dirs):
            extract_files_content(filepath, root_dir, whitelist_extensions, output_dir)


# Extract the names and contents of all files under the given path
current_directory = "docs"  # change this to the real path
whitelist_extensions = ['.md']  # whitelisted suffixes

extract_files_content(current_directory, whitelist_extensions=whitelist_extensions)
```

Last modification:October 14, 2024 © Allow specification reprint Like 如果觉得我的文章对你有用,请随意赞赏