
1 OWUI启动脚本
1# Open-WebUI Settings 2export DATA_DIR='data0331' 3export ENABLE_SIGNUP=True 4export DEFAULT_USER_ROLE='admin' 5export DEFAULT_GROUP_ID='xai' 6export OFFLINE_MODE=false 7export HF_HUB_OFFLINE=1 8 9# OpenAI API 配置 10export ENABLE_OLLAMA_API=false 11export ENABLE_OPENAI_API=true 12export OPENAI_API_BASE_URLS='https://api.siliconflow.cn/v1;' 13export OPENAI_API_KEYS='sk-xxx;' 14 15# Activate conda environment 16source ~/miniconda3/etc/profile.d/conda.sh 17conda activate owui-0.6.43 18 19# Start Open-WebUI 20open-webui serve --port 9999 21 22# pkill -f "open-webui --port 9999" && sleep 1 && ps aux | grep open-webui | grep -v grep || echo "已停止" 23 24
2 Chat提取脚本1

1#!/usr/bin/env python3 2""" 3从webui.db提取chat表数据,计算每个chat中所有content的字符长度,保存到新sqlite文件 4""" 5 6import sqlite3 7import json 8import os 9 10# 配置 11SOURCE_DB = "data0331/webui.db" 12TARGET_DB = "chat_content_stats.db" 13 14 15def extract_content_length(chat_json: str) -> int: 16 """ 17 从chat JSON中提取所有message的content字符长度总和 18 """ 19 if not chat_json: 20 return 0 21 22 try: 23 data = json.loads(chat_json) 24 except json.JSONDecodeError: 25 return 0 26 27 total_length = 0 28 29 # Open-WebUI的chat JSON结构: {"history": {"messages": {...}}} 30 messages = data.get("history", {}).get("messages", {}) 31 32 for msg_id, msg in messages.items(): 33 if isinstance(msg, dict): 34 content = msg.get("content", "") 35 if content: 36 total_length += len(str(content)) 37 38 return total_length 39 40 41def main(): 42 # 检查源数据库是否存在 43 if not os.path.exists(SOURCE_DB): 44 print(f"错误: 源数据库不存在: {SOURCE_DB}") 45 return 46 47 # 连接源数据库 48 source_conn = sqlite3.connect(SOURCE_DB) 49 source_cursor = source_conn.cursor() 50 51 # 获取所有chat记录 52 source_cursor.execute(""" 53 SELECT user_id, created_at, chat 54 FROM chat 55 ORDER BY created_at 56 """) 57 rows = source_cursor.fetchall() 58 print(f"从 {SOURCE_DB} 读取到 {len(rows)} 条chat记录") 59 60 # 创建目标数据库 61 if os.path.exists(TARGET_DB): 62 os.remove(TARGET_DB) 63 64 target_conn = sqlite3.connect(TARGET_DB) 65 target_cursor = target_conn.cursor() 66 67 # 创建目标表 68 target_cursor.execute(""" 69 CREATE TABLE chat_content_stats ( 70 user_id TEXT NOT NULL, 71 created_at TEXT NOT NULL, 72 content_length INTEGER NOT NULL 73 ) 74 """) 75 76 # 插入数据 77 processed = 0 78 for user_id, created_at, chat_json in rows: 79 content_length = extract_content_length(chat_json) 80 target_cursor.execute(""" 81 INSERT INTO chat_content_stats (user_id, created_at, content_length) 82 VALUES (?, ?, ?) 83 """, (user_id, created_at, content_length)) 84 processed += 1 85 86 target_conn.commit() 87 88 # 显示统计信息 89 target_cursor.execute("SELECT COUNT(*), SUM(content_length), AVG(content_length) FROM chat_content_stats") 90 count, total, avg = target_cursor.fetchone() 91 92 print(f"\n处理完成!") 93 print(f"输出文件: {TARGET_DB}") 94 print(f"总记录数: {count}") 95 print(f"总字符数: {total:,}") 96 print(f"平均每条chat字符数: {avg:.2f}") 97 98 # 显示前5条示例 99 print("\n前5条数据示例:") 100 target_cursor.execute("SELECT user_id, created_at, content_length FROM chat_content_stats LIMIT 5") 101 for row in target_cursor.fetchall(): 102 print(f" user_id={row[0][:8]}..., created_at={row[1]}, content_length={row[2]:,}") 103 104 # 关闭连接 105 source_conn.close() 106 target_conn.close() 107 108 109if __name__ == "__main__": 110 main() 111 112
3 Chat提取脚本2

1#!/usr/bin/env python3 2""" 3从webui.db提取chat表数据,计算每个chat中所有content的字符长度,保存到新sqlite文件 4""" 5 6import sqlite3 7import json 8import os 9 10import re 11 12 13# 配置 14SOURCE_DB = "data0331/webui.db" 15TARGET_DB = "chat_content_stats.db" 16 17 18def extract_content_stats(chat_json: str) -> dict: 19 """ 20 从chat JSON中提取内容统计:中文字、英文字母、数字、标点符号、空格、emoji、其他 21 """ 22 if not chat_json: 23 return { 24 "chinese": 0, "english": 0, "digit": 0, 25 "punctuation": 0, "space": 0, "emoji": 0, "other": 0 26 } 27 28 try: 29 data = json.loads(chat_json) 30 except json.JSONDecodeError: 31 return { 32 "chinese": 0, "english": 0, "digit": 0, 33 "punctuation": 0, "space": 0, "emoji": 0, "other": 0 34 } 35 36 chinese_count = 0 37 english_count = 0 38 digit_count = 0 39 punctuation_count = 0 40 space_count = 0 41 emoji_count = 0 42 other_count = 0 43 44 # 标点符号定义 45 punctuation_chars = set(".,;:!?-'\"()[]{}<>`~@#$%^&*+=_|\\/·,。;:!?、""''()【】《》「」『』〔〕~@#¥%……&*——+|\\") 46 47 # Emoji范围检测 48 def is_emoji(char): 49 code = ord(char) 50 return ( 51 0x1F600 <= code <= 0x1F64F or # 表情符号 52 0x1F300 <= code <= 0x1F5FF or # 符号和象形文字 53 0x1F680 <= code <= 0x1F6FF or # 交通和地图符号 54 0x1F700 <= code <= 0x1F77F or # 其他符号 55 0x1F780 <= code <= 0x1F7FF or # 几何图形 56 0x1F900 <= code <= 0x1F9FF or # 补充符号 57 0x1FA00 <= code <= 0x1FA6F or # 扩展符号 58 0x2600 <= code <= 0x26FF or # 杂项符号 59 0x2700 <= code <= 0x27BF or # 装饰符号 60 0x2B50 <= code <= 0x2B55 or # 更多符号 61 0x1F1E0 <= code <= 0x1F1FF # 国旗 62 ) 63 64 # Open-WebUI的chat JSON结构: {"history": {"messages": {...}}} 65 messages = data.get("history", {}).get("messages", {}) 66 67 for msg_id, msg in messages.items(): 68 if isinstance(msg, dict): 69 content = str(msg.get("content", "")) 70 for char in content: 71 if '\u4e00' <= char <= '\u9fff': # 中文字符范围 72 chinese_count += 1 73 elif char.isalpha(): # 英文字母 74 english_count += 1 75 elif char.isdigit(): # 数字 76 digit_count += 1 77 elif char.isspace(): # 空格、换行等空白字符 78 space_count += 1 79 elif is_emoji(char): # Emoji 80 emoji_count += 1 81 elif char in punctuation_chars: # 标点符号 82 punctuation_count += 1 83 else: 84 other_count += 1 85 86 return { 87 "chinese": chinese_count, 88 "english": english_count, 89 "digit": digit_count, 90 "punctuation": punctuation_count, 91 "space": space_count, 92 "emoji": emoji_count, 93 "other": other_count 94 } 95 96 97def main(): 98 # 检查源数据库是否存在 99 if not os.path.exists(SOURCE_DB): 100 print(f"错误: 源数据库不存在: {SOURCE_DB}") 101 return 102 103 # 连接源数据库 104 source_conn = sqlite3.connect(SOURCE_DB) 105 source_cursor = source_conn.cursor() 106 107 # 获取所有chat记录,包含user_id 108 source_cursor.execute(""" 109 SELECT user_id, created_at, chat 110 FROM chat 111 ORDER BY created_at 112 """) 113 rows = source_cursor.fetchall() 114 print(f"从 {SOURCE_DB} 读取到 {len(rows)} 条chat记录") 115 116 # 获取所有用户的email映射 {user_id: email} 117 source_cursor.execute("SELECT id, email FROM user") 118 user_emails = dict(source_cursor.fetchall()) 119 print(f"从 user 表读取到 {len(user_emails)} 个用户") 120 121 # 创建目标数据库 122 if os.path.exists(TARGET_DB): 123 os.remove(TARGET_DB) 124 125 target_conn = sqlite3.connect(TARGET_DB) 126 target_cursor = target_conn.cursor() 127 128 # 创建目标表 129 target_cursor.execute(""" 130 CREATE TABLE chat_content_stats ( 131 user_id TEXT NOT NULL, 132 email TEXT, 133 created_at TEXT NOT NULL, 134 chinese INTEGER NOT NULL DEFAULT 0, 135 english INTEGER NOT NULL DEFAULT 0, 136 digit INTEGER NOT NULL DEFAULT 0, 137 punctuation INTEGER NOT NULL DEFAULT 0, 138 space INTEGER NOT NULL DEFAULT 0, 139 emoji INTEGER NOT NULL DEFAULT 0, 140 other INTEGER NOT NULL DEFAULT 0 141 ) 142 """) 143 144 # 插入数据 145 processed = 0 146 for user_id, created_at, chat_json in rows: 147 stats = extract_content_stats(chat_json) 148 email = user_emails.get(user_id, "unknown") 149 target_cursor.execute(""" 150 INSERT INTO chat_content_stats 151 (user_id, email, created_at, chinese, english, digit, punctuation, space, emoji, other) 152 VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?) 153 """, (user_id, email, created_at, stats["chinese"], stats["english"], stats["digit"], 154 stats["punctuation"], stats["space"], stats["emoji"], stats["other"])) 155 processed += 1 156 157 target_conn.commit() 158 159 # 显示统计信息 160 target_cursor.execute(""" 161 SELECT 162 COUNT(*), 163 SUM(chinese), 164 SUM(english), 165 SUM(digit), 166 SUM(punctuation), 167 SUM(space), 168 SUM(emoji), 169 SUM(other), 170 AVG(chinese + english + digit + punctuation + space + emoji + other) 171 FROM chat_content_stats 172 """) 173 result = target_cursor.fetchone() 174 count = result[0] 175 total_chinese, total_english, total_digit, total_punct = result[1:5] 176 total_space, total_emoji, total_other = result[5:8] 177 avg = result[8] 178 179 # 计算总计 180 grand_total = sum([total_chinese, total_english, total_digit, 181 total_punct, total_space, total_emoji, total_other]) 182 183 print(f"\n处理完成!") 184 print(f"输出文件: {TARGET_DB}") 185 print(f"总记录数: {count}") 186 print(f"\n【字符类型统计】") 187 print(f" 中文字符: {total_chinese:,}") 188 print(f" 英文字母: {total_english:,}") 189 print(f" 数字: {total_digit:,}") 190 print(f" 标点符号: {total_punct:,}") 191 print(f" 空格换行: {total_space:,}") 192 print(f" Emoji: {total_emoji:,}") 193 print(f" 其他字符: {total_other:,}") 194 print(f" ─────────") 195 print(f" 总计: {grand_total:,}") 196 print(f"\n平均每条chat字符数: {avg:.2f}") 197 198 # 显示前5条示例 199 print("\n【前3条数据示例】:") 200 target_cursor.execute(""" 201 SELECT user_id, email, created_at, chinese, english, digit, 202 punctuation, space, emoji, other 203 FROM chat_content_stats 204 LIMIT 3 205 """) 206 for i, row in enumerate(target_cursor.fetchall()): 207 total = sum(row[3:]) 208 print(f"\n 记录 {i+1}:") 209 print(f" user_id={row[0][:8]}..., email={row[1]}") 210 print(f" created_at={row[2]}") 211 print(f" 中文={row[3]:,}, 英文={row[4]:,}, 数字={row[5]:,}") 212 print(f" 标点={row[6]:,}, 空格={row[7]:,}, emoji={row[8]:,}, 其他={row[9]:,}") 213 print(f" 总计={total:,}") 214 215 # 关闭连接 216 source_conn.close() 217 target_conn.close() 218 219 220if __name__ == "__main__": 221 main() 222 223
《260331-OpenWebUI统计所有Chat的对话字符个数》 是转载文章,点击查看原文。