feat: 添加数据匹配分析脚本,计算 loops_power 与 g5 表的匹配覆盖率

This commit is contained in:
2026-04-09 11:45:39 +08:00
parent f7df419e21
commit 6de7b91094
2 changed files with 61 additions and 140 deletions

View File

@@ -0,0 +1,61 @@
import os
import pandas as pd
from sqlalchemy import create_engine
import urllib.parse
from db_config import get_db_url
# Columns that together identify a room in both tables.
_KEY_COLUMNS = ['hotel_id', 'room_id']


def _normalize_keys(df):
    """Return a copy of ``df`` whose join keys are comparable across databases.

    ``hotel_id`` is coerced to nullable ``Int64``; values that cannot be
    parsed as numbers become ``<NA>``.  ``room_id`` is converted to a
    whitespace-stripped string with a single trailing ``.0`` removed (the
    artifact left when an integer id travelled through a float column).
    Missing ``room_id`` values stay missing instead of becoming the literal
    strings ``"nan"`` / ``"None"``.
    """
    cleaned = df.copy()
    cleaned['hotel_id'] = pd.to_numeric(cleaned['hotel_id'], errors='coerce').astype('Int64')

    raw_room = cleaned['room_id']
    # Anchor the pattern at the end: a plain substring replace of '.0' would
    # also corrupt ids such as '1.05' (-> '15') or '10.01' (-> '101').
    room_text = raw_room.astype(str).str.strip().str.replace(r'\.0$', '', regex=True)
    # Restore nulls that astype(str) turned into text.
    cleaned['room_id'] = room_text.where(raw_room.notna())
    return cleaned


def _flag_matches(df_loops, df_g5):
    """Left-join ``df_loops`` onto the distinct g5 key pairs.

    Returns ``df_loops`` (same row count, same order) with an extra boolean
    column ``g5_exists`` that is True when the row's (hotel_id, room_id) pair
    is present in ``df_g5``.
    """
    # pandas matches null join keys against each other, so rows with a null
    # key are removed from the reference to avoid counting them as matches.
    reference = df_g5[_KEY_COLUMNS].dropna().drop_duplicates()
    merged = df_loops.merge(reference, on=_KEY_COLUMNS, how='left', indicator=True)
    merged['g5_exists'] = merged['_merge'] == 'both'
    return merged.drop(columns='_merge')


def main():
    """Report how many ``loops_power`` rows have a matching room in the g5 table.

    Reads (hotel_id, room_id) from ``wh_test.loops_power`` via the "TARGET"
    connection and from ``room_status.room_status_moment_g5`` via the "PORTAL"
    connection, normalises the keys, and prints the match count, the coverage
    percentage and up to ten distinct example pairs for both the matched and
    the unmatched rows.  Nothing is returned or written back.
    """
    print("正在连接数据库...")
    url_80portal = get_db_url("PORTAL")
    url_test = get_db_url("TARGET")
    engine_80portal = create_engine(url_80portal)
    engine_test = create_engine(url_test)

    try:
        print("\n[1/3] 从 15433 test 库提取 loops_power 的数据...")
        df_loops = pd.read_sql(
            'SELECT hotel_id, room_id FROM "wh_test"."loops_power"',
            engine_test,
        )

        print("[2/3] 从 15434 log_platform 库提取 room_status_moment_g5 的数据...")
        df_g5 = pd.read_sql(
            'SELECT hotel_id, room_id FROM "room_status"."room_status_moment_g5"',
            engine_80portal,
        )
    finally:
        # Release pooled connections even when one of the queries fails.
        engine_test.dispose()
        engine_80portal.dispose()

    print("\n[3/3] 正在对齐类型并进行交集运算...")
    # Clean both sides identically so type or whitespace differences between
    # the two PostgreSQL sources do not prevent a match.
    df_loops = _normalize_keys(df_loops)
    df_g5 = _normalize_keys(df_g5)

    # Coverage analysis: the reference is de-duplicated, so the left join
    # keeps exactly one row per loops_power record.
    merged = _flag_matches(df_loops, df_g5)
    total_loops = len(df_loops)
    matched_loops = int(merged['g5_exists'].sum())
    match_rate = (matched_loops / total_loops * 100) if total_loops > 0 else 0

    print("=======================================")
    print("匹配结果报告:")
    print("=======================================")
    print(f"-> 提取的 loops_power 总记录: {total_loops}")
    print(f"-> 在 g5 表中成功映射找到的记录: {matched_loops}")
    print(f"-> 整体匹配覆盖率: {match_rate:.2f}%")

    # Rows whose key pair exists in g5.
    mapped = merged[merged['g5_exists']]
    if not mapped.empty:
        print("\n[√] 成功匹配配对示例(前10种):")
        print(mapped[_KEY_COLUMNS].drop_duplicates().head(10).to_string(index=False))

    # Rows whose key pair could not be found in g5.
    unmapped = merged[~merged['g5_exists']]
    if not unmapped.empty:
        print(f"\n[!] 发现 {len(unmapped)} 条扁平化回路未在 g5 表中找到对应的 hotel_id/room_id。以下为缺失配对示例(前10种):")
        print(unmapped[_KEY_COLUMNS].drop_duplicates().head(10).to_string(index=False))
# Run the analysis only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()