Introduction: Why Database Deduplication Matters
In modern software development, duplicate data is a common and thorny problem. Whether it is duplicate orders submitted accidentally by users, redundant records produced by system synchronization, or duplicate rows imported during data migration, duplicates degrade database performance and data accuracy. For Java developers, mastering efficient deduplication techniques improves application performance and helps guarantee data consistency.
Database deduplication is more than simply deleting duplicate rows; it touches data integrity, performance optimization, and business logic. Starting from the underlying principles, this article explores several strategies for handling duplicate database records in Java, with complete code examples showing how to implement efficient deduplication in real projects.
1. Causes and Impact of Duplicate Data
1.1 Common Scenarios That Produce Duplicate Data
Duplicate data typically arises in the following situations:
User actions: users re-submit a form due to network latency or by mistake
System integration: data synchronization between systems lacks idempotent design
Data migration: primary keys or unique identifiers are not handled correctly during ETL
Concurrency control: appropriate locking is missing under high concurrency
1.2 Problems Caused by Duplicate Data
Duplicate data leads to:
Wasted storage: redundant rows consume valuable storage space
Slower queries: index bloat degrades query performance
Incorrect business logic: statistics and reports become inaccurate
Wasted system resources: reprocessing the same data burns CPU and memory
2. Deduplication Principles and Strategies
2.1 SQL-Level Deduplication
At the database level, deduplication relies mainly on:
The DISTINCT keyword: removes duplicate rows at query time
The GROUP BY clause: groups rows by key columns before handling duplicates
Window functions: advanced deduplication with ROW_NUMBER() and similar functions
Unique index constraints: block duplicate inserts at the source
2.2 Application-Level Deduplication in Java
At the Java application level, we can use:
In-memory deduplication: Set, Map, and similar data structures (see the sketch after this list)
Batch processing: query and delete in batches
Cache-assisted checks: a cache such as Redis for idempotency
Asynchronous processing: background jobs that work through large volumes of duplicates
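As a first taste of the in-memory approach, here is a minimal, self-contained sketch (the Row record and its fields are made up for illustration, and Java 16+ is assumed for records): Collectors.toMap with a merge function keeps only the newest entry per business key. Section 4 develops fuller variants of this idea.
import java.time.Instant;
import java.util.*;
import java.util.stream.Collectors;

public class DedupSketch {
    // Illustrative record type: 'key' is the business identifier
    record Row(String key, Instant timestamp) {}

    // Keep the newest row per key; the merge function decides between duplicates
    static Collection<Row> dedupKeepNewest(List<Row> rows) {
        return rows.stream()
                .collect(Collectors.toMap(
                        Row::key,
                        r -> r,
                        (a, b) -> a.timestamp().isAfter(b.timestamp()) ? a : b))
                .values();
    }

    public static void main(String[] args) {
        List<Row> rows = List.of(
                new Row("order-1", Instant.parse("2024-01-01T00:00:00Z")),
                new Row("order-1", Instant.parse("2024-01-02T00:00:00Z")),
                new Row("order-2", Instant.parse("2024-01-01T00:00:00Z")));
        System.out.println(dedupKeepNewest(rows)); // order-1 keeps only its newer row
    }
}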
3. Basic Deduplication: SQL-Level Implementations
3.1 Simple Deduplication with DISTINCT
-- Query-time deduplication: fetch distinct user emails
SELECT DISTINCT email FROM users;
-- Multi-column deduplication: fetch distinct order combinations
SELECT DISTINCT user_id, product_id, order_date FROM orders;
3.2 Aggregate Deduplication with GROUP BY
-- Group by user and fetch each user's latest order date
SELECT user_id, MAX(order_date) as latest_order
FROM orders
GROUP BY user_id;
-- Combine aggregate functions to inspect duplicated data
SELECT user_id, COUNT(*) as order_count, SUM(amount) as total_amount
FROM orders
GROUP BY user_id
HAVING COUNT(*) > 1; -- show only users with duplicates
3.3 Advanced Deduplication with Window Functions (MySQL 8.0+)
-- Keep only the newest record in each group.
-- Note: if MySQL rejects this with error 1093 (cannot reference the target
-- table of a DELETE), wrap the subquery in an extra derived table to force
-- materialization, as shown in the Java examples later in this article.
WITH ranked_data AS (
SELECT *,
ROW_NUMBER() OVER (PARTITION BY user_id ORDER BY created_at DESC) as rn
FROM user_behavior
)
DELETE FROM user_behavior
WHERE id IN (
SELECT id FROM ranked_data WHERE rn > 1
);
-- Alternatively, simply query the deduplicated rows
SELECT * FROM (
SELECT *,
ROW_NUMBER() OVER (PARTITION BY user_id ORDER BY created_at DESC) as rn
FROM user_behavior
) t WHERE rn = 1;
3.4 Efficient Deduplication with a Temporary Table
-- Create a temporary table holding the deduplicated rows.
-- Note: DISTINCT * compares every column, including any auto-increment id;
-- to deduplicate on business keys, list those columns explicitly instead.
CREATE TEMPORARY TABLE temp_users AS
SELECT DISTINCT * FROM users;
-- Empty the original table and re-insert (caution: TRUNCATE commits implicitly in MySQL)
TRUNCATE TABLE users;
INSERT INTO users SELECT * FROM temp_users;
-- Drop the temporary table
DROP TEMPORARY TABLE temp_users;
4. Application-Level Deduplication in Java
4.1 In-Memory Deduplication with the Java Stream API
import java.math.BigDecimal;
import java.time.LocalDateTime;
import java.util.*;
import java.util.stream.Collectors;
public class InMemoryDeduplication {
// Example: order deduplication
public static class Order {
private Long id;
private String orderId;
private String userId;
private BigDecimal amount;
private LocalDateTime orderTime;
// Constructor, getters, and setters omitted
// equals and hashCode are based on the business key
@Override
public boolean equals(Object o) {
if (this == o) return true;
if (o == null || getClass() != o.getClass()) return false;
Order order = (Order) o;
return Objects.equals(orderId, order.orderId) &&
Objects.equals(userId, order.userId);
}
@Override
public int hashCode() {
return Objects.hash(orderId, userId);
}
}
/**
 * Deduplicate with a Set
 */
public List<Order> deduplicateWithSet(List<Order> orders) {
// A Set drops duplicates automatically (requires correct equals/hashCode)
Set<Order> uniqueOrders = new HashSet<>(orders);
return new ArrayList<>(uniqueOrders);
}
/**
 * Deduplicate with Stream's distinct() (also relies on equals/hashCode)
 */
public List<Order> deduplicateWithStream(List<Order> orders) {
return orders.stream()
.distinct()
.collect(Collectors.toList());
}
/**
 * Deduplicate with a Map, keeping the latest record
 */
public List<Order> deduplicateKeepLatest(List<Order> orders) {
Map<String, Order> orderMap = new LinkedHashMap<>();
for (Order order : orders) {
String key = order.getUserId() + ":" + order.getOrderId();
// Later entries overwrite earlier ones, keeping the newest
// (assumes the input list is ordered from oldest to newest)
orderMap.put(key, order);
}
return new ArrayList<>(orderMap.values());
}
/**
 * Composite-key deduplication: a dedicated key object
 */
public List<Order> deduplicateWithCompositeKey(List<Order> orders) {
class OrderKey {
private final String orderId;
private final String userId;
public OrderKey(String orderId, String userId) {
this.orderId = orderId;
this.userId = userId;
}
@Override
public boolean equals(Object o) {
if (this == o) return true;
if (o == null || getClass() != o.getClass()) return false;
OrderKey orderKey = (OrderKey) o;
return Objects.equals(orderId, orderKey.orderId) &&
Objects.equals(userId, orderKey.userId);
}
@Override
public int hashCode() {
return Objects.hash(orderId, userId);
}
}
Map<OrderKey, Order> map = new LinkedHashMap<>();
for (Order order : orders) {
OrderKey key = new OrderKey(order.getOrderId(), order.getUserId());
map.put(key, order);
}
return new ArrayList<>(map.values());
}
}
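A quick usage sketch for the class above, assuming the omitted Order constructor takes (orderId, userId, amount, orderTime) as in the full entity of section 7.1:
// Usage sketch: two orders sharing orderId and userId collapse into one
InMemoryDeduplication dedup = new InMemoryDeduplication();
List<InMemoryDeduplication.Order> orders = List.of(
        new InMemoryDeduplication.Order("o-1", "u-1", new BigDecimal("10.00"), LocalDateTime.now().minusHours(1)),
        new InMemoryDeduplication.Order("o-1", "u-1", new BigDecimal("10.00"), LocalDateTime.now()),
        new InMemoryDeduplication.Order("o-2", "u-1", new BigDecimal("20.00"), LocalDateTime.now()));
System.out.println(dedup.deduplicateWithStream(orders).size()); // prints 2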
4.2 Java Deduplication Driven by Database Queries
import java.sql.*;
import java.util.*;
public class DatabaseDeduplication {
private Connection connection;
public DatabaseDeduplication(Connection connection) {
this.connection = connection;
}
/**
 * Query and return deduplicated rows
 */
public List<Map<String, Object>> queryDistinctRecords(String tableName, String[] columns)
throws SQLException {
String columnList = String.join(", ", columns);
// Identifiers cannot be bound as JDBC parameters; validate tableName and
// columns against a whitelist in real code to avoid SQL injection
String sql = String.format("SELECT DISTINCT %s FROM %s", columnList, tableName);
List<Map<String, Object>> results = new ArrayList<>();
try (Statement stmt = connection.createStatement();
ResultSet rs = stmt.executeQuery(sql)) {
while (rs.next()) {
Map<String, Object> row = new HashMap<>();
for (String column : columns) {
row.put(column, rs.getObject(column));
}
results.add(row);
}
}
return results;
}
/**
 * Bulk-delete duplicate rows, keeping one row per key group
 */
public int deleteDuplicateRecords(String tableName, String[] keyColumns) throws SQLException {
// The extra derived table ("keep") avoids MySQL error 1093: a DELETE may not
// select from its own target table in a subquery unless it is materialized
StringBuilder sql = new StringBuilder();
sql.append("DELETE FROM ").append(tableName).append(" WHERE id NOT IN (");
sql.append("SELECT min_id FROM (SELECT MIN(id) AS min_id FROM ").append(tableName);
sql.append(" GROUP BY ");
sql.append(String.join(", ", keyColumns));
sql.append(") keep)");
try (Statement stmt = connection.createStatement()) {
return stmt.executeUpdate(sql.toString());
}
}
/**
 * Efficient deduplication via a temporary table.
 * Caution: in MySQL, TRUNCATE TABLE causes an implicit commit, so the
 * rollback below cannot restore the original rows; back the table up first.
 */
public void deduplicateUsingTempTable(String sourceTable, String[] columns) throws SQLException {
String columnList = String.join(", ", columns);
// Create the temporary table
String createTemp = String.format(
"CREATE TEMPORARY TABLE temp_%s AS SELECT DISTINCT %s FROM %s",
sourceTable, columnList, sourceTable
);
// Empty the original table
String truncate = String.format("TRUNCATE TABLE %s", sourceTable);
// Re-insert the deduplicated rows
String insertBack = String.format(
"INSERT INTO %s (%s) SELECT %s FROM temp_%s",
sourceTable, columnList, columnList, sourceTable
);
// Drop the temporary table
String dropTemp = String.format("DROP TEMPORARY TABLE temp_%s", sourceTable);
connection.setAutoCommit(false);
try (Statement stmt = connection.createStatement()) {
stmt.execute(createTemp);
stmt.execute(truncate);
stmt.execute(insertBack);
stmt.execute(dropTemp);
connection.commit();
} catch (SQLException e) {
connection.rollback();
throw e;
} finally {
connection.setAutoCommit(true);
}
}
}
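A usage sketch for the class above; the connection URL and credentials are placeholders:
// Usage sketch: drop duplicate (user_id, order_id) rows from 'orders',
// keeping the row with the smallest id in each group
try (Connection conn = DriverManager.getConnection(
        "jdbc:mysql://localhost:3306/test", "user", "password")) {
    DatabaseDeduplication dedup = new DatabaseDeduplication(conn);
    int removed = dedup.deleteDuplicateRecords("orders", new String[]{"user_id", "order_id"});
    System.out.println("Removed " + removed + " duplicate rows");
}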
4.3 Advanced Deduplication for Complex Business Scenarios
import java.sql.*;
import java.util.*;
import java.util.stream.Collectors;
public class AdvancedDeduplication {
/**
 * Keep the newest record per group
 */
public void keepLatestRecords(Connection conn, String tableName,
String[] groupColumns, String timestampColumn) throws SQLException {
String sql = String.format(
"DELETE FROM %s WHERE id NOT IN (" +
" SELECT id FROM (" +
" SELECT id, ROW_NUMBER() OVER (" +
" PARTITION BY %s ORDER BY %s DESC" +
" ) as rn FROM %s" +
" ) t WHERE rn = 1" +
")", tableName, String.join(", ", groupColumns), timestampColumn, tableName);
try (Statement stmt = conn.createStatement()) {
stmt.executeUpdate(sql);
}
}
/**
 * Business-rule deduplication (keep the record with the largest amount)
 */
public void keepMaxAmountRecords(Connection conn, String tableName) throws SQLException {
String sql = String.format(
"DELETE FROM %s WHERE id NOT IN (" +
" SELECT id FROM (" +
" SELECT id, ROW_NUMBER() OVER (" +
" PARTITION BY user_id, product_id ORDER BY amount DESC" +
" ) as rn FROM %s" +
" ) t WHERE rn = 1" +
")", tableName, tableName);
try (Statement stmt = conn.createStatement()) {
stmt.executeUpdate(sql);
}
}
/**
 * Batched deduplication for large data volumes
 */
public void batchDeduplicate(Connection conn, String tableName,
String[] keyColumns, int batchSize) throws SQLException {
String groupCols = String.join(", ", keyColumns);
// Count how many duplicate rows need deleting
String countSql = String.format(
"SELECT COUNT(*) FROM %s WHERE id NOT IN (" +
" SELECT MIN(id) FROM %s GROUP BY %s" +
")", tableName, tableName, groupCols);
int totalToDelete;
try (Statement stmt = conn.createStatement();
ResultSet rs = stmt.executeQuery(countSql)) {
rs.next();
totalToDelete = rs.getInt(1);
}
System.out.println("总共需要删除 " + totalToDelete + " 条重复记录");
int deletedCount = 0;
conn.setAutoCommit(false); // required for the explicit per-batch commit below
while (deletedCount < totalToDelete) {
// The derived tables sidestep two MySQL restrictions: error 1093 (deleting
// while selecting from the same table) and the ban on LIMIT inside IN subqueries
String deleteSql = String.format(
"DELETE FROM %s WHERE id IN (" +
" SELECT id FROM (" +
"  SELECT id FROM %s WHERE id NOT IN (" +
"   SELECT min_id FROM (SELECT MIN(id) AS min_id FROM %s GROUP BY %s) keep" +
"  ) LIMIT %d" +
" ) batch" +
")", tableName, tableName, tableName, groupCols, batchSize);
try (Statement stmt = conn.createStatement()) {
int batchDeleted = stmt.executeUpdate(deleteSql);
if (batchDeleted == 0) break;
deletedCount += batchDeleted;
System.out.println("已删除: " + deletedCount + " / " + totalToDelete);
// 提交事务,避免长时间锁表
conn.commit();
// 短暂休眠,避免对数据库造成过大压力
try {
Thread.sleep(100);
} catch (InterruptedException e) {
Thread.currentThread().interrupt();
break;
}
}
}
}
}
5. Best Practices for Preventing Duplicate Data
5.1 Database Design
-- 1. Create a unique index to block duplicate inserts
ALTER TABLE orders ADD UNIQUE INDEX uk_user_order (user_id, order_id);
-- 2. Use a composite primary key
CREATE TABLE user_behavior (
user_id BIGINT,
behavior_type VARCHAR(50),
behavior_time DATETIME,
PRIMARY KEY (user_id, behavior_type, behavior_time)
);
-- 3. Use a CHECK constraint (MySQL 8.0.16+)
ALTER TABLE users ADD CONSTRAINT chk_email CHECK (email REGEXP '^[A-Za-z0-9+_.-]+@(.+)$');
5.2 Application-Level Prevention in Java
import java.sql.*;
import java.util.*;
public class PreventionStrategy {
private final Connection connection;
public PreventionStrategy(Connection connection) {
this.connection = connection;
}
/**
 * Database unique constraint + Java exception handling
 */
public void insertWithUniqueConstraint(Order order) throws SQLException {
String sql = "INSERT INTO orders (user_id, order_id, amount, order_time) VALUES (?, ?, ?, ?)";
try (PreparedStatement pstmt = connection.prepareStatement(sql)) {
pstmt.setString(1, order.getUserId());
pstmt.setString(2, order.getOrderId());
pstmt.setBigDecimal(3, order.getAmount());
pstmt.setTimestamp(4, Timestamp.valueOf(order.getOrderTime()));
pstmt.executeUpdate();
} catch (SQLIntegrityConstraintViolationException e) {
// Unique constraint violated, so this is a duplicate
System.out.println("Duplicate order detected: " + order.getOrderId());
// Either update the existing row or ignore; updateIfExists is an application-specific helper (not shown)
updateIfExists(order);
}
}
/**
 * INSERT IGNORE or ON DUPLICATE KEY UPDATE
 */
public void insertIgnoreDuplicates(Order order) throws SQLException {
// MySQL-specific syntax
String sql = "INSERT IGNORE INTO orders (user_id, order_id, amount, order_time) VALUES (?, ?, ?, ?)";
// Or use ON DUPLICATE KEY UPDATE instead:
// String sql = "INSERT INTO orders (user_id, order_id, amount, order_time) VALUES (?, ?, ?, ?) " +
//     "ON DUPLICATE KEY UPDATE amount = VALUES(amount), order_time = VALUES(order_time)";
try (PreparedStatement pstmt = connection.prepareStatement(sql)) {
pstmt.setString(1, order.getUserId());
pstmt.setString(2, order.getOrderId());
pstmt.setBigDecimal(3, order.getAmount());
pstmt.setTimestamp(4, Timestamp.valueOf(order.getOrderTime()));
int affected = pstmt.executeUpdate();
if (affected == 0) {
System.out.println("订单已存在,未插入: " + order.getOrderId());
}
}
}
/**
 * Idempotency check backed by a cache (Redis outline; a fuller sketch follows this class)
 */
public boolean isDuplicate(String orderKey, String redisKey) {
// Pseudocode: an atomic SET-if-absent acts as a distributed duplicate guard
// Boolean isNew = redisTemplate.opsForValue().setIfAbsent(orderKey, "1", 30, TimeUnit.MINUTES);
// return !isNew;
return false; // a real implementation needs a Redis connection
}
/**
 * Idempotent insert: check first, then insert.
 * Note: check-then-insert is racy under concurrency, so keep a unique
 * constraint on (user_id, order_id) as the final safety net.
 */
public void idempotentInsert(Order order) throws SQLException {
// Check whether the record already exists
String checkSql = "SELECT COUNT(*) FROM orders WHERE user_id = ? AND order_id = ?";
try (PreparedStatement checkStmt = connection.prepareStatement(checkSql)) {
checkStmt.setString(1, order.getUserId());
checkStmt.setString(2, order.getOrderId());
try (ResultSet rs = checkStmt.executeQuery()) {
if (rs.next() && rs.getInt(1) > 0) {
System.out.println("记录已存在,跳过插入");
return;
}
}
}
// Absent, so insert it
String insertSql = "INSERT INTO orders (user_id, order_id, amount, order_time) VALUES (?, ?, ?, ?)";
try (PreparedStatement insertStmt = connection.prepareStatement(insertSql)) {
insertStmt.setString(1, order.getUserId());
insertStmt.setString(2, order.getOrderId());
insertStmt.setBigDecimal(3, order.getAmount());
insertStmt.setTimestamp(4, Timestamp.valueOf(order.getOrderTime()));
insertStmt.executeUpdate();
}
}
}
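To make the isDuplicate outline above concrete, here is a sketch of the same idea using Spring Data Redis; the StringRedisTemplate dependency and the "dedup:" key prefix are assumptions for illustration, not requirements. Because the SET-if-absent call is atomic on the Redis server, concurrent submitters cannot both pass the check, unlike the check-then-insert pattern above.
import java.util.concurrent.TimeUnit;
import org.springframework.data.redis.core.StringRedisTemplate;

public class RedisIdempotencyGuard {
    private final StringRedisTemplate redis;

    public RedisIdempotencyGuard(StringRedisTemplate redis) {
        this.redis = redis;
    }

    /**
     * Atomically marks the order key as seen for 30 minutes.
     * Returns true if the key was already present, i.e. a duplicate request.
     */
    public boolean isDuplicate(String orderKey) {
        Boolean isNew = redis.opsForValue()
                .setIfAbsent("dedup:" + orderKey, "1", 30, TimeUnit.MINUTES);
        return !Boolean.TRUE.equals(isNew);
    }
}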
6. Performance Optimization and Monitoring
6.1 Optimizing Deduplication at Scale
import java.sql.*;
import java.util.*;
public class PerformanceOptimization {
/**
 * Create indexes that speed up deduplication queries
 */
public void createOptimalIndexes(Connection conn) throws SQLException {
String[] indexes = {
// Index the deduplication key columns
"CREATE INDEX idx_user_order ON orders(user_id, order_id)",
"CREATE INDEX idx_timestamp ON orders(order_time)",
// Covering index for the dedup query
"CREATE INDEX idx_covering ON orders(user_id, order_id, amount, order_time)"
};
try (Statement stmt = conn.createStatement()) {
for (String index : indexes) {
try {
stmt.execute(index);
} catch (SQLException e) {
// The index may already exist
System.out.println("Index creation failed: " + e.getMessage());
}
}
}
}
/**
 * Analyze a table's duplicate rate
 */
public void analyzeDuplicateRate(Connection conn, String tableName, String[] keyColumns)
throws SQLException {
String groupCols = String.join(", ", keyColumns);
String sql = String.format(
"SELECT " +
" COUNT(*) as total_rows, " +
" COUNT(DISTINCT %s) as unique_rows, " +
" (COUNT(*) - COUNT(DISTINCT %s)) as duplicate_rows, " +
" ROUND((COUNT(*) - COUNT(DISTINCT %s)) * 100.0 / COUNT(*), 2) as duplicate_rate " +
"FROM %s", groupCols, groupCols, groupCols, tableName);
try (Statement stmt = conn.createStatement();
ResultSet rs = stmt.executeQuery(sql)) {
if (rs.next()) {
System.out.println("表 " + tableName + " 的重复率分析:");
System.out.println("总行数: " + rs.getLong("total_rows"));
System.out.println("唯一行数: " + rs.getLong("unique_rows"));
System.out.println("重复行数: " + rs.getLong("duplicate_rows"));
System.out.println("重复率: " + rs.getDouble("duplicate_rate") + "%");
}
}
}
/**
 * Inspect the dedup query plan with EXPLAIN
 */
public void explainDeduplicationQuery(Connection conn, String tableName, String[] keyColumns)
throws SQLException {
String groupCols = String.join(", ", keyColumns);
// Same derived-table wrapper as earlier, so the DELETE itself is valid in MySQL
String sql = String.format(
"EXPLAIN DELETE FROM %s WHERE id NOT IN (" +
" SELECT min_id FROM (SELECT MIN(id) AS min_id FROM %s GROUP BY %s) keep" +
")", tableName, tableName, groupCols);
try (Statement stmt = conn.createStatement();
ResultSet rs = stmt.executeQuery(sql)) {
System.out.println("查询执行计划:");
while (rs.next()) {
System.out.println("id: " + rs.getObject("id") +
", select_type: " + rs.getObject("select_type") +
", table: " + rs.getObject("table") +
", type: " + rs.getObject("type") +
", rows: " + rs.getObject("rows") +
", Extra: " + rs.getObject("Extra"));
}
}
}
}
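A usage sketch for the analysis helper, with the same placeholder URL and credentials as before:
// Usage sketch: print the duplicate rate of 'orders' on (user_id, order_id)
try (Connection conn = DriverManager.getConnection(
        "jdbc:mysql://localhost:3306/test", "user", "password")) {
    PerformanceOptimization perf = new PerformanceOptimization();
    perf.analyzeDuplicateRate(conn, "orders", new String[]{"user_id", "order_id"});
}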
6.2 Asynchronous Deduplication
import java.sql.*;
import java.util.*;
import java.util.concurrent.*;
import java.util.stream.Collectors;
public class AsyncDeduplication {
private final ExecutorService executor = Executors.newFixedThreadPool(4);
/**
 * Asynchronous batched deduplication
 */
public CompletableFuture<Void> asyncDeduplicate(String tableName, String[] keyColumns) {
return CompletableFuture.runAsync(() -> {
try {
// Simulated connection; real projects should use a connection pool
Connection conn = DriverManager.getConnection(
"jdbc:mysql://localhost:3306/test", "user", "password");
conn.setAutoCommit(false); // required for the explicit per-batch commit below
// Process in batches
int batchSize = 1000;
String groupCols = String.join(", ", keyColumns);
String countSql = String.format(
"SELECT COUNT(*) FROM %s WHERE id NOT IN (" +
" SELECT MIN(id) FROM %s GROUP BY %s" +
")", tableName, tableName, groupCols);
int total;
try (Statement stmt = conn.createStatement();
ResultSet rs = stmt.executeQuery(countSql)) {
rs.next();
total = rs.getInt(1);
}
int processed = 0;
while (processed < total) {
// The derived tables sidestep MySQL error 1093 and the LIMIT-in-IN restriction
String deleteSql = String.format(
"DELETE FROM %s WHERE id IN (" +
" SELECT id FROM (" +
"  SELECT id FROM %s WHERE id NOT IN (" +
"   SELECT min_id FROM (SELECT MIN(id) AS min_id FROM %s GROUP BY %s) keep" +
"  ) LIMIT %d" +
" ) batch" +
")", tableName, tableName, tableName, groupCols, batchSize);
try (Statement stmt = conn.createStatement()) {
int deleted = stmt.executeUpdate(deleteSql);
if (deleted == 0) break;
processed += deleted;
System.out.printf("线程 %s: 已处理 %d/%d 条记录%n",
Thread.currentThread().getName(), processed, total);
conn.commit();
}
}
conn.close();
} catch (SQLException e) {
throw new RuntimeException(e);
}
}, executor);
}
/**
 * Deduplicate several tables in parallel
 */
public void parallelDeduplicateTables(List<String> tableNames, String[] keyColumns) {
List<CompletableFuture<Void>> futures = tableNames.stream()
.map(table -> asyncDeduplicate(table, keyColumns))
.collect(Collectors.toList());
// Wait for every task to finish
CompletableFuture.allOf(futures.toArray(new CompletableFuture[0]))
.thenRun(() -> {
System.out.println("所有表去重完成");
executor.shutdown();
})
.exceptionally(ex -> {
System.err.println("去重过程中发生错误: " + ex.getMessage());
executor.shutdown();
return null;
});
}
}
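A usage sketch; the table names here are hypothetical. Note that parallelDeduplicateTables returns immediately, so a real caller must keep the JVM alive until the futures complete, for example by joining on them instead of relying on the fire-and-forget callback.
// Usage sketch: deduplicate two tables in parallel on the same key columns
AsyncDeduplication async = new AsyncDeduplication();
async.parallelDeduplicateTables(
        Arrays.asList("orders", "order_archive"),
        new String[]{"user_id", "order_id"});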
7. Complete Project Example: Deduplicating an Order System
7.1 The Entity Class
import java.math.BigDecimal;
import java.time.LocalDateTime;
import java.util.Objects;
public class Order {
private Long id;
private String orderId;
private String userId;
private BigDecimal amount;
private LocalDateTime orderTime;
private String status;
public Order() {}
public Order(String orderId, String userId, BigDecimal amount, LocalDateTime orderTime) {
this.orderId = orderId;
this.userId = userId;
this.amount = amount;
this.orderTime = orderTime;
}
// Getters and setters omitted (the service below relies on them)
@Override
public boolean equals(Object o) {
if (this == o) return true;
if (o == null || getClass() != o.getClass()) return false;
Order order = (Order) o;
return Objects.equals(orderId, order.orderId) &&
Objects.equals(userId, order.userId);
}
@Override
public int hashCode() {
return Objects.hash(orderId, userId);
}
@Override
public String toString() {
return "Order{" +
"id=" + id +
", orderId='" + orderId + '\'' +
", userId='" + userId + '\'' +
", amount=" + amount +
", orderTime=" + orderTime +
'}';
}
}
7.2 The Deduplication Service
import java.sql.*;
import java.util.*;
import java.util.stream.Collectors;
public class OrderDeduplicationService {
private final Connection connection;
private static final int BATCH_SIZE = 1000;
public OrderDeduplicationService(Connection connection) {
this.connection = connection;
}
/**
 * Option 1: deduplicate in the database (recommended for large volumes)
 */
public int deduplicateInDatabase() throws SQLException {
String sql = "DELETE FROM orders WHERE id NOT IN (" +
" SELECT id FROM (" +
" SELECT id, ROW_NUMBER() OVER (" +
" PARTITION BY user_id, order_id ORDER BY order_time DESC" +
" ) as rn FROM orders" +
" ) t WHERE rn = 1" +
")";
try (Statement stmt = connection.createStatement()) {
int deleted = stmt.executeUpdate(sql);
System.out.println("数据库层面删除重复记录: " + deleted + " 条");
return deleted;
}
}
/**
 * Option 2: in-memory deduplication in Java (suited to small and medium volumes)
 */
public List<Order> deduplicateInMemory(List<Order> orders) {
// Use a Map, keeping the newest record per key
Map<String, Order> orderMap = new HashMap<>();
for (Order order : orders) {
String key = order.getUserId() + ":" + order.getOrderId();
Order existing = orderMap.get(key);
// If the key already exists, keep the record with the later timestamp
if (existing == null || order.getOrderTime().isAfter(existing.getOrderTime())) {
orderMap.put(key, order);
}
}
return new ArrayList<>(orderMap.values());
}
/**
 * Option 3: batched query + in-memory dedup + batched delete
 */
public void deduplicateBatch() throws SQLException {
// 1. Fetch all rows that belong to a duplicated (user_id, order_id) group
String querySql = "SELECT * FROM orders WHERE (user_id, order_id) IN (" +
" SELECT user_id, order_id FROM orders " +
" GROUP BY user_id, order_id " +
" HAVING COUNT(*) > 1" +
") ORDER BY user_id, order_id, order_time DESC";
List<Order> allOrders = new ArrayList<>();
try (Statement stmt = connection.createStatement();
ResultSet rs = stmt.executeQuery(querySql)) {
while (rs.next()) {
Order order = new Order();
order.setId(rs.getLong("id"));
order.setOrderId(rs.getString("order_id"));
order.setUserId(rs.getString("user_id"));
order.setAmount(rs.getBigDecimal("amount"));
order.setOrderTime(rs.getTimestamp("order_time").toLocalDateTime());
order.setStatus(rs.getString("status"));
allOrders.add(order);
}
}
// 2. Dedup in memory, keeping the newest record per key
List<Order> uniqueOrders = deduplicateInMemory(allOrders);
// 3. Work out which IDs to delete: rows in the duplicated groups that lost out.
// (A blanket "DELETE ... WHERE id NOT IN (keepIds)" would also wipe out every
// row that was never duplicated, so the delete must target known IDs only.)
Set<Long> keepIds = uniqueOrders.stream()
.map(Order::getId)
.collect(Collectors.toSet());
Set<Long> deleteIds = allOrders.stream()
.map(Order::getId)
.filter(id -> !keepIds.contains(id))
.collect(Collectors.toSet());
// 4. Delete the duplicate rows in one statement
if (!deleteIds.isEmpty()) {
String deleteSql = "DELETE FROM orders WHERE id IN (" +
deleteIds.stream()
.map(String::valueOf)
.collect(Collectors.joining(",")) +
")";
try (Statement stmt = connection.createStatement()) {
int deleted = stmt.executeUpdate(deleteSql);
System.out.println("Duplicate rows deleted in batch: " + deleted);
}
}
}
/**
 * Option 4: temp-table deduplication.
 * Caution: TRUNCATE TABLE commits implicitly in MySQL, so the rollback in
 * the catch block cannot undo it; take a backup before running this.
 */
public void deduplicateWithTempTable() throws SQLException {
connection.setAutoCommit(false);
try {
// 1. Create a temp table holding only the newest row per (user_id, order_id)
String createTemp = "CREATE TEMPORARY TABLE temp_orders AS " +
"SELECT * FROM orders WHERE id IN (" +
" SELECT id FROM (" +
" SELECT id, ROW_NUMBER() OVER (" +
" PARTITION BY user_id, order_id ORDER BY order_time DESC" +
" ) as rn FROM orders" +
" ) t WHERE rn = 1" +
")";
// 2. Empty the original table
String truncate = "TRUNCATE TABLE orders";
// 3. Re-insert the deduplicated rows
String insertBack = "INSERT INTO orders SELECT * FROM temp_orders";
// 4. Drop the temp table
String dropTemp = "DROP TEMPORARY TABLE temp_orders";
try (Statement stmt = connection.createStatement()) {
stmt.execute(createTemp);
stmt.execute(truncate);
stmt.execute(insertBack);
stmt.execute(dropTemp);
}
connection.commit();
System.out.println("临时表方式去重完成");
} catch (SQLException e) {
connection.rollback();
throw e;
} finally {
connection.setAutoCommit(true);
}
}
/**
 * Verify the deduplication result
 */
public boolean verifyDeduplication() throws SQLException {
String sql = "SELECT user_id, order_id, COUNT(*) as cnt " +
"FROM orders GROUP BY user_id, order_id " +
"HAVING COUNT(*) > 1";
try (Statement stmt = connection.createStatement();
ResultSet rs = stmt.executeQuery(sql)) {
if (rs.next()) {
System.out.println("验证失败:仍存在重复记录");
return false;
}
}
System.out.println("验证成功:无重复记录");
return true;
}
}
7.3 Driver Program
import java.sql.*;
import java.math.BigDecimal;
import java.time.LocalDateTime;
import java.util.*;
public class DeduplicationDemo {
public static void main(String[] args) {
try {
// 1. Obtain a database connection
Connection conn = DriverManager.getConnection(
"jdbc:mysql://localhost:3306/test", "root", "password");
// 2. Create test data
createTestData(conn);
// 3. Run the deduplication
OrderDeduplicationService service = new OrderDeduplicationService(conn);
// Option 1: database-level dedup
// service.deduplicateInDatabase();
// Option 2: batched dedup
service.deduplicateBatch();
// Option 3: temp-table dedup
// service.deduplicateWithTempTable();
// 4. Verify the result
service.verifyDeduplication();
conn.close();
} catch (SQLException e) {
e.printStackTrace();
}
}
private static void createTestData(Connection conn) throws SQLException {
// Empty the table
try (Statement stmt = conn.createStatement()) {
stmt.execute("TRUNCATE TABLE orders");
}
// Insert deliberately duplicated rows
String sql = "INSERT INTO orders (user_id, order_id, amount, order_time, status) VALUES (?, ?, ?, ?, ?)";
try (PreparedStatement pstmt = conn.prepareStatement(sql)) {
// Simulated duplicates: several rows share the same user and order number
for (int i = 1; i <= 100; i++) {
String userId = "user_" + (i % 10); // 10 users
String orderId = "order_" + (i % 20); // 20 order numbers
pstmt.setString(1, userId);
pstmt.setString(2, orderId);
pstmt.setBigDecimal(3, new BigDecimal("99.99"));
pstmt.setTimestamp(4, Timestamp.valueOf(LocalDateTime.now().minusMinutes(i)));
pstmt.setString(5, "ACTIVE");
pstmt.addBatch();
if (i % 50 == 0) {
pstmt.executeBatch();
}
}
pstmt.executeBatch();
}
System.out.println("测试数据创建完成");
}
}
8. Summary and Recommendations
8.1 Choosing a Strategy by Scenario
| Scenario | Recommended strategy | Pros | Cons |
|---|---|---|---|
| Under 10K rows | In-memory dedup in Java | Simple and fast | Uses heap memory |
| 10K to 1M rows | Batched processing + in-memory dedup | Balances performance and resources | More complex to implement |
| Over 1M rows | Database-level dedup | Best performance | May lock the table |
| Real-time systems | Unique index + idempotent design | Prevention first | Requires up-front design |
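For teams that want the table above encoded as a starting point, a tiny helper can map row counts to a strategy; the thresholds are the article's rules of thumb, not hard limits.
public class StrategyPicker {
    // Mirrors the table above; thresholds are rules of thumb, not hard limits
    enum DedupStrategy { IN_MEMORY, BATCHED, DATABASE_LEVEL }

    static DedupStrategy pick(long rowCount) {
        if (rowCount < 10_000) return DedupStrategy.IN_MEMORY;      // under 10K rows
        if (rowCount < 1_000_000) return DedupStrategy.BATCHED;     // 10K to 1M rows
        return DedupStrategy.DATABASE_LEVEL;                        // over 1M rows
    }
}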
8.2 Key Takeaways
Prevention beats cure: stop duplicates at the source with database constraints and application-level idempotency
Pick the right tool: choose the dedup strategy that matches your data volume and business needs
Monitor continuously: analyze duplicate rates regularly and tune indexes and queries
Back up first: always back up the data before any dedup operation (see the sketch after this list)
Batch large jobs: process big tables in batches to avoid holding locks for long
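Since backing up before a dedup run is non-negotiable, here is a minimal JDBC sketch; the connection details and the date-suffixed backup table name are illustrative conventions, and MySQL syntax is assumed.
// Minimal pre-dedup backup: copy the table before deleting anything.
// Note: CREATE TABLE ... AS copies data but not indexes, constraints,
// or auto-increment settings, so it is a data safety net, not a clone.
try (Connection conn = DriverManager.getConnection(
        "jdbc:mysql://localhost:3306/test", "user", "password");
     Statement stmt = conn.createStatement()) {
    stmt.execute("CREATE TABLE orders_backup_20240101 AS SELECT * FROM orders");
}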
8.3 Production Recommendations
Validate in a test environment: fully exercise the dedup logic before touching production
Roll out gradually: run production dedup in batches and monitor the performance impact
Protect with transactions: keep each dedup operation atomic
Log everything: record the dedup process in detail so problems can be traced
Prepare a rollback plan: have a data-recovery path ready in case something goes wrong
With the explanations and complete code examples above, you should now have a solid grasp of the core techniques for database deduplication in Java. In real projects, choose the approach that best fits your business scenario and data volume, so that both data quality and system performance are protected.
