MySQL8表中有大量重复数据去重后保存到新表方案
MySQL8表中有大量重复数据去重后保存到新表方案
一,找deepseek来解决
MySQL8 数据库中有两张表:
表 mat_transport_license,创建表SQL如下:
CREATE TABLE `mat_transport_license` (
`ID` varchar(32) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci NOT NULL COMMENT '主键',
`PERM_ID` int DEFAULT NULL COMMENT '微机编号',
`PRINT_ID` varchar(8) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci DEFAULT NULL COMMENT '纸张号',
`ORDER_ID` varchar(35) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci DEFAULT NULL COMMENT '凭证编号',
UNIQUE KEY `PERM_ID` (`PERM_ID`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_general_ci ROW_FORMAT=DYNAMIC COMMENT='准运证';
表 mat_transport_license_detail ,创建表SQL如下:
CREATE TABLE `mat_transport_license_detail` (
`ZYB_ID` varchar(64) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci NOT NULL COMMENT '主键',
`PERM_ID` varchar(20) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci DEFAULT NULL,
`ORDER_ID` varchar(20) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci DEFAULT NULL,
`SPEC` varchar(100) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci DEFAULT NULL
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_general_ci ROW_FORMAT=DYNAMIC ;
现需要根据表 mat_transport_license 中的 PERM_ID字段中的数据匹配 mat_transport_license_detail中 PERM_ID字段中的数据,
但 mat_transport_license_detail中有大量重复数据,需要根据 两表 PERM_ID字段关联数据,提取 mat_transport_license_detail中 MAX(ZYB_ID)行的数据,插入到新表 mat_transport_license_detail_new 。
因为表有大量数据约5千万条, 需要使用存储过程,分批执行,请给出优化方案:
二,DeeSeek 多次回复后提到的最终结果:
错误原因很明确:字符集排序规则(collation)冲突。
明细表
mat_transport_license_detail的字段使用utf8mb4_general_ciCAST(lic.PERM_ID AS CHAR)没有指定排序规则,会继承数据库或会话的默认排序规则(MySQL 8.0 默认是utf8mb4_0900_ai_ci)两者在
=比较时混用,导致1267错误。
解决方法:在转换时显式指定与明细表相同的排序规则,使用 COLLATE utf8mb4_general_ci。
版本一(窗口函数版,已修正排序规则)
sql
DELIMITER $$
DROP PROCEDURE IF EXISTS proc_batch_max_zyb$$
CREATE PROCEDURE proc_batch_max_zyb(IN p_batch_count INT)
BEGIN
DECLARE v_min_id INT;
DECLARE v_max_id INT;
DECLARE v_start INT DEFAULT 0;
DECLARE v_end INT;
DECLARE v_batch_rows INT;
DECLARE v_total_rows INT DEFAULT 0;
DECLARE v_done INT DEFAULT 0;
SELECT MIN(PERM_ID), MAX(PERM_ID) INTO v_min_id, v_max_id
FROM mat_transport_license WHERE PERM_ID IS NOT NULL;
IF v_min_id IS NOT NULL THEN
IF p_batch_count IS NULL OR p_batch_count <= 0 THEN
SET p_batch_count = 5000;
END IF;
WHILE v_done = 0 DO
SET v_end = v_start + p_batch_count;
IF v_end > v_max_id THEN
SET v_end = v_max_id;
SET v_done = 1;
END IF;
START TRANSACTION;
INSERT INTO mat_transport_license_detail_new
SELECT
ZYB_ID, PERM_ID, ORDER_ID, SERIAL, PRODUCT_ID,
PRODUCT_NAME, MADE_IN, SPEC, UNIT, PRICE, QUANTITY, ZYB_TID
FROM (
SELECT
det.*,
ROW_NUMBER() OVER (
PARTITION BY det.PERM_ID ORDER BY det.ZYB_ID DESC
) AS rn
FROM mat_transport_license_detail det
WHERE det.PERM_ID IN (
SELECT CAST(lic.PERM_ID AS CHAR CHARACTER SET utf8mb4)
COLLATE utf8mb4_general_ci
FROM mat_transport_license lic
WHERE lic.PERM_ID > v_start AND lic.PERM_ID <= v_end
)
) t
WHERE t.rn = 1;
SET v_batch_rows = ROW_COUNT();
SET v_total_rows = v_total_rows + v_batch_rows;
COMMIT;
SELECT CONCAT('Batch [', v_start+1, ' ~ ', v_end, '] inserted ',
v_batch_rows, ' rows, total: ', v_total_rows) AS progress;
SET v_start = v_end;
END WHILE;
SELECT CONCAT('All done. Total rows: ', v_total_rows) AS final_message;
ELSE
SELECT 'No data in mat_transport_license' AS msg;
END IF;
END$$
DELIMITER ;
版本二(无窗口函数版,同样修正)
sql
DELIMITER $$
DROP PROCEDURE IF EXISTS proc_batch_max_zyb_no_window$$
CREATE PROCEDURE proc_batch_max_zyb_no_window(IN p_batch_count INT)
BEGIN
DECLARE v_min_id INT;
DECLARE v_max_id INT;
DECLARE v_start INT DEFAULT 0;
DECLARE v_end INT;
DECLARE v_batch_rows INT;
DECLARE v_total_rows INT DEFAULT 0;
DECLARE v_done INT DEFAULT 0;
SELECT MIN(PERM_ID), MAX(PERM_ID) INTO v_min_id, v_max_id
FROM mat_transport_license WHERE PERM_ID IS NOT NULL;
IF v_min_id IS NOT NULL THEN
IF p_batch_count IS NULL OR p_batch_count <= 0 THEN
SET p_batch_count = 5000;
END IF;
WHILE v_done = 0 DO
SET v_end = v_start + p_batch_count;
IF v_end > v_max_id THEN
SET v_end = v_max_id;
SET v_done = 1;
END IF;
START TRANSACTION;
INSERT INTO mat_transport_license_detail_new
SELECT d.*
FROM mat_transport_license_detail d
INNER JOIN (
SELECT PERM_ID, MAX(ZYB_ID) AS max_zyb
FROM mat_transport_license_detail
WHERE PERM_ID IN (
SELECT CAST(lic.PERM_ID AS CHAR CHARACTER SET utf8mb4)
COLLATE utf8mb4_general_ci
FROM mat_transport_license lic
WHERE lic.PERM_ID > v_start AND lic.PERM_ID <= v_end
)
GROUP BY PERM_ID
) t ON d.PERM_ID = t.PERM_ID AND d.ZYB_ID = t.max_zyb;
SET v_batch_rows = ROW_COUNT();
SET v_total_rows = v_total_rows + v_batch_rows;
COMMIT;
SELECT CONCAT('Batch [', v_start+1, ' ~ ', v_end, '] inserted ',
v_batch_rows, ' rows, total: ', v_total_rows) AS progress;
SET v_start = v_end;
END WHILE;
SELECT CONCAT('All done. Total rows: ', v_total_rows) AS final_message;
ELSE
SELECT 'No data in mat_transport_license' AS msg;
END IF;
END$$
DELIMITER ;
执行步骤
确认索引和目标表已存在(如果还未执行):
sqlALTER TABLE mat_transport_license_detail ADD INDEX idx_perm_zyb (PERM_ID, ZYB_ID);
CREATE TABLE mat_transport_license_detail_new LIKE mat_transport_license_detail;调用存储过程:
sqlCALL proc_batch_max_zyb(5000);校验、补索引(同前)。
这两个版本已将子查询中的字符串转换强制设为 utf8mb4_general_ci,与明细表字段完全一致,不会再报 1267 错误。
目录 返回
首页