Java Database Deduplication in Practice: From Principles to Code, Handling Duplicate Data Efficiently and Optimizing Storage Performance

Introduction: Why Database Deduplication Matters

In modern software development, duplicate data is a common and thorny problem. Whether it is duplicate orders submitted accidentally by users, redundant records produced by system synchronization, or duplicate rows imported during data migration, duplicates degrade both database performance and data accuracy. For Java developers, mastering efficient deduplication techniques not only improves application performance but also helps guarantee data consistency.

Database deduplication is more than simply deleting duplicate rows; it touches data integrity, performance optimization, and business logic. Starting from first principles, this article explores several strategies for handling duplicate database records in Java and demonstrates, with complete code examples, how to implement efficient deduplication in real projects.

1. Where Duplicate Data Comes From and What It Costs

1.1 Common Scenarios That Produce Duplicates

Duplicate data typically arises in the following situations:

User actions: users submit a form more than once because of network latency or accidental clicks

System integration: data synchronization between systems lacks idempotency guarantees

Data migration: ETL pipelines fail to handle primary keys or unique identifiers correctly

Concurrency control: high-concurrency code paths lack appropriate locking

1.2 Problems Caused by Duplicate Data

Duplicate data leads to:

Wasted storage: redundant rows consume valuable disk space

Slower queries: bloated indexes degrade query performance

Incorrect business logic: statistics and reports become inaccurate

Wasted system resources: reprocessing identical data burns CPU and memory

2. Deduplication Principles and Strategies

2.1 SQL-Level Deduplication

At the database level, deduplication relies mainly on:

The DISTINCT keyword: removes duplicate rows at query time

The GROUP BY clause: groups rows by key columns so duplicates can be processed

Window functions: ROW_NUMBER() and friends enable more precise deduplication

Unique index constraints: prevent duplicate rows from being inserted in the first place

2.2 Application-Level Strategies in Java

In the Java application layer we can use:

In-memory deduplication: Set, Map, and similar data structures (see the sketch after this list)

Batch processing: query and delete in chunks

Cache-assisted checks: Redis or another cache system

Asynchronous processing: background jobs that clean up large volumes of duplicates
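As a quick preview of the in-memory approach (Section 4 develops it fully), a merge function on Collectors.toMap keeps exactly one record per business key. This is a minimal sketch; the Order type, its getters, and the key format are assumptions that match the entity defined in Section 7.1:

import java.util.*;
import java.util.stream.Collectors;

public class DeduplicationPreview {

    /** Keeps the newest Order per (userId, orderId) business key. */
    public static List<Order> keepLatest(List<Order> orders) {
        Map<String, Order> latest = orders.stream()
            .collect(Collectors.toMap(
                o -> o.getUserId() + ":" + o.getOrderId(), // business key
                o -> o,
                // merge function: on key collision, keep the more recent record
                (a, b) -> a.getOrderTime().isAfter(b.getOrderTime()) ? a : b));
        return new ArrayList<>(latest.values());
    }
}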

3. Basic Deduplication: SQL-Level Implementations

3.1 Simple Deduplication with DISTINCT

-- Query-time deduplication: fetch distinct user emails
SELECT DISTINCT email FROM users;

-- Multi-column deduplication: fetch distinct order combinations
SELECT DISTINCT user_id, product_id, order_date FROM orders;

3.2 Aggregate Deduplication with GROUP BY

-- Group by user and fetch each user's latest order date
SELECT user_id, MAX(order_date) AS latest_order
FROM orders
GROUP BY user_id;

-- Combine aggregate functions to inspect duplicated data
SELECT user_id, COUNT(*) AS order_count, SUM(amount) AS total_amount
FROM orders
GROUP BY user_id
HAVING COUNT(*) > 1; -- show only users with duplicates

3.3 Advanced Deduplication with Window Functions (MySQL 8.0+)

-- Keep only the newest record in each group.
-- MySQL materializes the CTE, which lets the DELETE reference the same table;
-- if your version rejects this form with error 1093, rewrite it as a
-- multi-table DELETE ... JOIN against the same derived table.
WITH ranked_data AS (
    SELECT *,
           ROW_NUMBER() OVER (PARTITION BY user_id ORDER BY created_at DESC) AS rn
    FROM user_behavior
)
DELETE FROM user_behavior
WHERE id IN (
    SELECT id FROM ranked_data WHERE rn > 1
);

-- Or simply query the deduplicated rows
SELECT * FROM (
    SELECT *,
           ROW_NUMBER() OVER (PARTITION BY user_id ORDER BY created_at DESC) AS rn
    FROM user_behavior
) t WHERE rn = 1;

3.4 Efficient Deduplication with a Temporary Table

-- Create a temporary table holding the deduplicated rows.
-- Note: DISTINCT * compares every column, including the primary key,
-- so this only collapses rows that are identical in all columns.
CREATE TEMPORARY TABLE temp_users AS
SELECT DISTINCT * FROM users;

-- Empty the original table and reinsert.
-- Caution: in MySQL, TRUNCATE commits implicitly and cannot be rolled back.
TRUNCATE TABLE users;
INSERT INTO users SELECT * FROM temp_users;

-- Drop the temporary table
DROP TEMPORARY TABLE temp_users;

4. Deduplication in the Java Application Layer

4.1 In-Memory Deduplication with the Java Stream API

import java.math.BigDecimal;
import java.time.LocalDateTime;
import java.util.*;
import java.util.stream.Collectors;

public class InMemoryDeduplication {

    // Example: order deduplication
    public static class Order {
        private Long id;
        private String orderId;
        private String userId;
        private BigDecimal amount;
        private LocalDateTime orderTime;

        // Constructor, getters and setters omitted

        // equals and hashCode are based on the business-level unique key
        @Override
        public boolean equals(Object o) {
            if (this == o) return true;
            if (o == null || getClass() != o.getClass()) return false;
            Order order = (Order) o;
            return Objects.equals(orderId, order.orderId) &&
                   Objects.equals(userId, order.userId);
        }

        @Override
        public int hashCode() {
            return Objects.hash(orderId, userId);
        }
    }

    /**
     * Deduplicate with a Set.
     */
    public List<Order> deduplicateUsingSet(List<Order> orders) {
        // A Set drops duplicates automatically (requires correct equals/hashCode)
        Set<Order> uniqueOrders = new HashSet<>(orders);
        return new ArrayList<>(uniqueOrders);
    }

    /**
     * Deduplicate with Stream.distinct().
     */
    public List<Order> deduplicateUsingStream(List<Order> orders) {
        return orders.stream()
                .distinct()
                .collect(Collectors.toList());
    }

    /**
     * Deduplicate with a Map (keeps the record seen last).
     */
    public List<Order> deduplicateUsingMap(List<Order> orders) {
        Map<String, Order> orderMap = new HashMap<>();
        for (Order order : orders) {
            String key = order.getUserId() + ":" + order.getOrderId();
            // later entries overwrite earlier ones, so the last one wins
            orderMap.put(key, order);
        }
        return new ArrayList<>(orderMap.values());
    }

    /**
     * Composite-key deduplication using a custom key object.
     */
    public List<Order> deduplicateWithComplexKey(List<Order> orders) {
        class OrderKey {
            private final String orderId;
            private final String userId;

            OrderKey(String orderId, String userId) {
                this.orderId = orderId;
                this.userId = userId;
            }

            @Override
            public boolean equals(Object o) {
                if (this == o) return true;
                if (o == null || getClass() != o.getClass()) return false;
                OrderKey orderKey = (OrderKey) o;
                return Objects.equals(orderId, orderKey.orderId) &&
                       Objects.equals(userId, orderKey.userId);
            }

            @Override
            public int hashCode() {
                return Objects.hash(orderId, userId);
            }
        }

        Map<OrderKey, Order> map = new HashMap<>();
        for (Order order : orders) {
            OrderKey key = new OrderKey(order.getOrderId(), order.getUserId());
            map.put(key, order);
        }
        return new ArrayList<>(map.values());
    }
}
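A short usage sketch for the helpers above; it assumes the omitted Order constructor takes (orderId, userId, amount, orderTime), as in the entity class of Section 7.1:

import java.math.BigDecimal;
import java.time.LocalDateTime;
import java.util.*;

public class InMemoryDeduplicationDemo {
    public static void main(String[] args) {
        InMemoryDeduplication dedup = new InMemoryDeduplication();
        List<InMemoryDeduplication.Order> orders = new ArrayList<>(List.of(
            // two records sharing the same (orderId, userId) business key
            new InMemoryDeduplication.Order("o-1", "u-1", new BigDecimal("10.00"), LocalDateTime.now()),
            new InMemoryDeduplication.Order("o-1", "u-1", new BigDecimal("10.00"), LocalDateTime.now()),
            new InMemoryDeduplication.Order("o-2", "u-1", new BigDecimal("25.00"), LocalDateTime.now())
        ));
        System.out.println(dedup.deduplicateUsingSet(orders).size());    // 2
        System.out.println(dedup.deduplicateUsingStream(orders).size()); // 2
    }
}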

4.2 Database-Query-Based Deduplication in Java

import java.sql.*;
import java.util.*;

public class DatabaseDeduplication {

    private final Connection connection;

    public DatabaseDeduplication(Connection connection) {
        this.connection = connection;
    }

    /**
     * Query and return deduplicated rows.
     */
    public List<Map<String, Object>> getDistinctData(String tableName, String[] columns)
            throws SQLException {
        String columnList = String.join(", ", columns);
        String sql = String.format("SELECT DISTINCT %s FROM %s", columnList, tableName);
        List<Map<String, Object>> results = new ArrayList<>();
        try (Statement stmt = connection.createStatement();
             ResultSet rs = stmt.executeQuery(sql)) {
            while (rs.next()) {
                Map<String, Object> row = new HashMap<>();
                for (String column : columns) {
                    row.put(column, rs.getObject(column));
                }
                results.add(row);
            }
        }
        return results;
    }

    /**
     * Delete duplicate rows in bulk, keeping one row per group.
     */
    public int deleteDuplicateRecords(String tableName, String[] keyColumns) throws SQLException {
        // The extra derived table (k) works around MySQL error 1093, which
        // forbids selecting directly from the table being deleted from.
        StringBuilder sql = new StringBuilder();
        sql.append("DELETE FROM ").append(tableName).append(" WHERE id NOT IN (");
        sql.append("SELECT keep_id FROM (SELECT MIN(id) AS keep_id FROM ").append(tableName);
        sql.append(" GROUP BY ");
        sql.append(String.join(", ", keyColumns));
        sql.append(") k)");
        try (Statement stmt = connection.createStatement()) {
            return stmt.executeUpdate(sql.toString());
        }
    }

    /**
     * Efficient deduplication through a temporary table.
     */
    public void deduplicateUsingTempTable(String sourceTable, String[] columns) throws SQLException {
        String columnList = String.join(", ", columns);
        // Create the temporary table
        String createTemp = String.format(
            "CREATE TEMPORARY TABLE temp_%s AS SELECT DISTINCT %s FROM %s",
            sourceTable, columnList, sourceTable
        );
        // Empty the original table. Note: TRUNCATE commits implicitly in MySQL,
        // so the rollback below cannot undo it; DELETE FROM is the
        // transactional alternative.
        String truncate = String.format("TRUNCATE TABLE %s", sourceTable);
        // Reinsert the deduplicated rows
        String insertBack = String.format(
            "INSERT INTO %s (%s) SELECT %s FROM temp_%s",
            sourceTable, columnList, columnList, sourceTable
        );
        // Drop the temporary table
        String dropTemp = String.format("DROP TEMPORARY TABLE temp_%s", sourceTable);

        connection.setAutoCommit(false);
        try (Statement stmt = connection.createStatement()) {
            stmt.execute(createTemp);
            stmt.execute(truncate);
            stmt.execute(insertBack);
            stmt.execute(dropTemp);
            connection.commit();
        } catch (SQLException e) {
            connection.rollback();
            throw e;
        } finally {
            connection.setAutoCommit(true);
        }
    }
}

4.3 Advanced Deduplication: Complex Business Scenarios

import java.sql.*;

public class AdvancedDeduplication {

    /**
     * Keep only the newest record per group.
     */
    public void keepLatestRecords(Connection conn, String tableName,
                                  String[] groupColumns, String timestampColumn) throws SQLException {
        String sql = String.format(
            "DELETE FROM %s WHERE id NOT IN (" +
            "  SELECT id FROM (" +
            "    SELECT id, ROW_NUMBER() OVER (" +
            "      PARTITION BY %s ORDER BY %s DESC" +
            "    ) AS rn FROM %s" +
            "  ) t WHERE rn = 1" +
            ")", tableName, String.join(", ", groupColumns), timestampColumn, tableName);
        try (Statement stmt = conn.createStatement()) {
            stmt.executeUpdate(sql);
        }
    }

    /**
     * Business-rule deduplication (keep the record with the largest amount).
     */
    public void keepMaxAmountRecords(Connection conn, String tableName) throws SQLException {
        String sql = String.format(
            "DELETE FROM %s WHERE id NOT IN (" +
            "  SELECT id FROM (" +
            "    SELECT id, ROW_NUMBER() OVER (" +
            "      PARTITION BY user_id, product_id ORDER BY amount DESC" +
            "    ) AS rn FROM %s" +
            "  ) t WHERE rn = 1" +
            ")", tableName, tableName);
        try (Statement stmt = conn.createStatement()) {
            stmt.executeUpdate(sql);
        }
    }

    /**
     * Deduplicate a large table in batches.
     */
    public void batchDeduplicate(Connection conn, String tableName,
                                 String[] keyColumns, int batchSize) throws SQLException {
        String groupCols = String.join(", ", keyColumns);
        // Count how many rows need to be deleted
        String countSql = String.format(
            "SELECT COUNT(*) FROM %s WHERE id NOT IN (" +
            "  SELECT MIN(id) FROM %s GROUP BY %s" +
            ")", tableName, tableName, groupCols);
        int totalToDelete;
        try (Statement stmt = conn.createStatement();
             ResultSet rs = stmt.executeQuery(countSql)) {
            rs.next();
            totalToDelete = rs.getInt(1);
        }
        System.out.println("Total duplicate rows to delete: " + totalToDelete);

        conn.setAutoCommit(false); // commit() below requires autocommit off
        try {
            int deletedCount = 0;
            while (deletedCount < totalToDelete) {
                // MySQL does not allow LIMIT directly inside an IN subquery, and
                // error 1093 forbids referencing the delete target, so the limited
                // id list is wrapped in a materialized derived table (batch_ids).
                String deleteSql = String.format(
                    "DELETE FROM %s WHERE id IN (" +
                    "  SELECT id FROM (" +
                    "    SELECT id FROM %s WHERE id NOT IN (" +
                    "      SELECT keep_id FROM (SELECT MIN(id) AS keep_id FROM %s GROUP BY %s) k" +
                    "    ) LIMIT %d" +
                    "  ) batch_ids" +
                    ")", tableName, tableName, tableName, groupCols, batchSize);
                try (Statement stmt = conn.createStatement()) {
                    int batchDeleted = stmt.executeUpdate(deleteSql);
                    if (batchDeleted == 0) break;
                    deletedCount += batchDeleted;
                    System.out.println("Deleted: " + deletedCount + " / " + totalToDelete);
                    // Commit each batch to avoid holding locks for a long time
                    conn.commit();
                    // Sleep briefly to avoid putting too much pressure on the database
                    try {
                        Thread.sleep(100);
                    } catch (InterruptedException e) {
                        Thread.currentThread().interrupt();
                        break;
                    }
                }
            }
        } finally {
            conn.setAutoCommit(true);
        }
    }
}

5. Best Practices for Preventing Duplicate Data

5.1 Database Design

-- 1. A unique index blocks duplicate inserts at the source
ALTER TABLE orders ADD UNIQUE INDEX uk_user_order (user_id, order_id);

-- 2. A composite primary key
CREATE TABLE user_behavior (
    user_id BIGINT,
    behavior_type VARCHAR(50),
    behavior_time DATETIME,
    PRIMARY KEY (user_id, behavior_type, behavior_time)
);

-- 3. CHECK constraints (MySQL 8.0.16+)
ALTER TABLE users ADD CONSTRAINT chk_email CHECK (email REGEXP '^[A-Za-z0-9+_.-]+@(.+)$');

5.2 Prevention Strategies in the Java Application Layer

import java.sql.*;

public class PreventionStrategy {

    private final Connection connection;

    public PreventionStrategy(Connection connection) {
        this.connection = connection;
    }

    /**
     * Rely on a database unique constraint plus Java exception handling.
     */
    public void insertWithUniqueConstraint(Order order) throws SQLException {
        String sql = "INSERT INTO orders (user_id, order_id, amount, order_time) VALUES (?, ?, ?, ?)";
        try (PreparedStatement pstmt = connection.prepareStatement(sql)) {
            pstmt.setString(1, order.getUserId());
            pstmt.setString(2, order.getOrderId());
            pstmt.setBigDecimal(3, order.getAmount());
            pstmt.setTimestamp(4, Timestamp.valueOf(order.getOrderTime()));
            pstmt.executeUpdate();
        } catch (SQLIntegrityConstraintViolationException e) {
            // Unique constraint violated: this is a duplicate
            System.out.println("Duplicate order detected: " + order.getOrderId());
            // Choose to update instead, or simply ignore
            updateIfExists(order); // update logic omitted here
        }
    }

    /**
     * Use INSERT IGNORE or ON DUPLICATE KEY UPDATE.
     */
    public void insertIgnoreDuplicates(Order order) throws SQLException {
        // MySQL syntax
        String sql = "INSERT IGNORE INTO orders (user_id, order_id, amount, order_time) VALUES (?, ?, ?, ?)";
        // Alternatively, upsert with ON DUPLICATE KEY UPDATE:
        // String sql = "INSERT INTO orders (user_id, order_id, amount, order_time) VALUES (?, ?, ?, ?) " +
        //              "ON DUPLICATE KEY UPDATE amount = VALUES(amount), order_time = VALUES(order_time)";
        try (PreparedStatement pstmt = connection.prepareStatement(sql)) {
            pstmt.setString(1, order.getUserId());
            pstmt.setString(2, order.getOrderId());
            pstmt.setBigDecimal(3, order.getAmount());
            pstmt.setTimestamp(4, Timestamp.valueOf(order.getOrderTime()));
            int affected = pstmt.executeUpdate();
            if (affected == 0) {
                System.out.println("Order already exists, nothing inserted: " + order.getOrderId());
            }
        }
    }

    /**
     * Idempotency check backed by a cache (Redis example).
     */
    public boolean isDuplicate(String orderKey) {
        // Pseudocode: use Redis SET NX as a distributed guard
        // Boolean isNew = redisTemplate.opsForValue().setIfAbsent(orderKey, "1", 30, TimeUnit.MINUTES);
        // return !isNew;
        return false; // a real implementation needs a Redis connection; see the sketch below
    }

    /**
     * Idempotent insert: check first, then insert.
     */
    public void idempotentInsert(Order order) throws SQLException {
        // Does the record already exist?
        String checkSql = "SELECT COUNT(*) FROM orders WHERE user_id = ? AND order_id = ?";
        try (PreparedStatement checkStmt = connection.prepareStatement(checkSql)) {
            checkStmt.setString(1, order.getUserId());
            checkStmt.setString(2, order.getOrderId());
            try (ResultSet rs = checkStmt.executeQuery()) {
                if (rs.next() && rs.getInt(1) > 0) {
                    System.out.println("Record already exists, skipping insert");
                    return;
                }
            }
        }
        // Not present: insert it. Note that check-then-insert is racy under
        // concurrency; the unique constraint remains the final line of defense.
        String insertSql = "INSERT INTO orders (user_id, order_id, amount, order_time) VALUES (?, ?, ?, ?)";
        try (PreparedStatement insertStmt = connection.prepareStatement(insertSql)) {
            insertStmt.setString(1, order.getUserId());
            insertStmt.setString(2, order.getOrderId());
            insertStmt.setBigDecimal(3, order.getAmount());
            insertStmt.setTimestamp(4, Timestamp.valueOf(order.getOrderTime()));
            insertStmt.executeUpdate();
        }
    }
}
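To make the cached idempotency check above concrete, here is a minimal sketch using the Jedis client (an assumption; the original pseudocode targets Spring's RedisTemplate). The key prefix and the 30-minute TTL are illustrative choices:

import redis.clients.jedis.Jedis;
import redis.clients.jedis.params.SetParams;

public class RedisIdempotencyCheck {

    private final Jedis jedis;

    public RedisIdempotencyCheck(Jedis jedis) {
        this.jedis = jedis;
    }

    /**
     * Returns true if orderKey was already seen within the TTL window.
     * SET ... NX EX is atomic, so two concurrent callers cannot both pass.
     */
    public boolean isDuplicate(String orderKey) {
        String reply = jedis.set("dedup:order:" + orderKey, "1",
                SetParams.setParams().nx().ex(30 * 60));
        return reply == null; // a null reply means the key already existed
    }
}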

6. Performance Optimization and Monitoring

6.1 Optimizing Deduplication over Large Data Sets

import java.sql.*;

public class PerformanceOptimization {

    /**
     * Create indexes that support deduplication queries.
     */
    public void createOptimalIndexes(Connection conn) throws SQLException {
        String[] indexes = {
            // index the deduplication key columns
            "CREATE INDEX idx_user_order ON orders(user_id, order_id)",
            "CREATE INDEX idx_timestamp ON orders(order_time)",
            // covering index
            "CREATE INDEX idx_covering ON orders(user_id, order_id, amount, order_time)"
        };
        try (Statement stmt = conn.createStatement()) {
            for (String index : indexes) {
                try {
                    stmt.execute(index);
                } catch (SQLException e) {
                    // the index may already exist
                    System.out.println("Index creation failed: " + e.getMessage());
                }
            }
        }
    }

    /**
     * Analyze a table's duplicate rate.
     */
    public void analyzeDuplicateRate(Connection conn, String tableName, String[] keyColumns)
            throws SQLException {
        String groupCols = String.join(", ", keyColumns);
        String sql = String.format(
            "SELECT " +
            "  COUNT(*) AS total_rows, " +
            "  COUNT(DISTINCT %s) AS unique_rows, " +
            "  (COUNT(*) - COUNT(DISTINCT %s)) AS duplicate_rows, " +
            "  ROUND((COUNT(*) - COUNT(DISTINCT %s)) * 100.0 / COUNT(*), 2) AS duplicate_rate " +
            "FROM %s", groupCols, groupCols, groupCols, tableName);
        try (Statement stmt = conn.createStatement();
             ResultSet rs = stmt.executeQuery(sql)) {
            if (rs.next()) {
                System.out.println("Duplicate-rate analysis for table " + tableName + ":");
                System.out.println("Total rows: " + rs.getLong("total_rows"));
                System.out.println("Unique rows: " + rs.getLong("unique_rows"));
                System.out.println("Duplicate rows: " + rs.getLong("duplicate_rows"));
                System.out.println("Duplicate rate: " + rs.getDouble("duplicate_rate") + "%");
            }
        }
    }

    /**
     * Inspect the deduplication query plan with EXPLAIN.
     */
    public void explainDeduplicationQuery(Connection conn, String tableName, String[] keyColumns)
            throws SQLException {
        String groupCols = String.join(", ", keyColumns);
        // Same derived-table wrap as the real delete, so the plan matches
        String sql = String.format(
            "EXPLAIN DELETE FROM %s WHERE id NOT IN (" +
            "  SELECT keep_id FROM (SELECT MIN(id) AS keep_id FROM %s GROUP BY %s) k" +
            ")", tableName, tableName, groupCols);
        try (Statement stmt = conn.createStatement();
             ResultSet rs = stmt.executeQuery(sql)) {
            System.out.println("Query execution plan:");
            while (rs.next()) {
                System.out.println("id: " + rs.getObject("id") +
                        ", select_type: " + rs.getObject("select_type") +
                        ", table: " + rs.getObject("table") +
                        ", type: " + rs.getObject("type") +
                        ", rows: " + rs.getObject("rows") +
                        ", Extra: " + rs.getObject("Extra"));
            }
        }
    }
}

6.2 Asynchronous Deduplication

import java.sql.*;
import java.util.*;
import java.util.concurrent.*;
import java.util.stream.Collectors;

public class AsyncDeduplication {

    private final ExecutorService executor = Executors.newFixedThreadPool(4);

    /**
     * Deduplicate one table asynchronously, in batches.
     */
    public CompletableFuture<Void> asyncDeduplicate(String tableName, String[] keyColumns) {
        return CompletableFuture.runAsync(() -> {
            // Simulated connection (a real project should use a connection pool)
            try (Connection conn = DriverManager.getConnection(
                    "jdbc:mysql://localhost:3306/test", "user", "password")) {
                conn.setAutoCommit(false); // commit() below requires autocommit off
                // Process in batches
                int batchSize = 1000;
                String groupCols = String.join(", ", keyColumns);
                String countSql = String.format(
                    "SELECT COUNT(*) FROM %s WHERE id NOT IN (" +
                    "  SELECT MIN(id) FROM %s GROUP BY %s" +
                    ")", tableName, tableName, groupCols);
                int total;
                try (Statement stmt = conn.createStatement();
                     ResultSet rs = stmt.executeQuery(countSql)) {
                    rs.next();
                    total = rs.getInt(1);
                }
                int processed = 0;
                while (processed < total) {
                    // Wrap the limited id list in a derived table: MySQL rejects
                    // LIMIT directly inside an IN subquery and forbids referencing
                    // the delete target at the top level of a subquery.
                    String deleteSql = String.format(
                        "DELETE FROM %s WHERE id IN (" +
                        "  SELECT id FROM (" +
                        "    SELECT id FROM %s WHERE id NOT IN (" +
                        "      SELECT keep_id FROM (SELECT MIN(id) AS keep_id FROM %s GROUP BY %s) k" +
                        "    ) LIMIT %d" +
                        "  ) batch_ids" +
                        ")", tableName, tableName, tableName, groupCols, batchSize);
                    try (Statement stmt = conn.createStatement()) {
                        int deleted = stmt.executeUpdate(deleteSql);
                        if (deleted == 0) break;
                        processed += deleted;
                        System.out.printf("Thread %s: processed %d/%d rows%n",
                                Thread.currentThread().getName(), processed, total);
                        conn.commit();
                    }
                }
            } catch (SQLException e) {
                throw new RuntimeException(e);
            }
        }, executor);
    }

    /**
     * Deduplicate several tables in parallel.
     */
    public void parallelDeduplicateTables(List<String> tables, String[] keyColumns) {
        List<CompletableFuture<Void>> futures = tables.stream()
                .map(table -> asyncDeduplicate(table, keyColumns))
                .collect(Collectors.toList());
        // Wait for every task to finish
        CompletableFuture.allOf(futures.toArray(new CompletableFuture[0]))
                .thenRun(() -> {
                    System.out.println("All tables deduplicated");
                    executor.shutdown();
                })
                .exceptionally(ex -> {
                    System.err.println("Error during deduplication: " + ex.getMessage());
                    executor.shutdown();
                    return null;
                });
    }
}

7. Complete Project Example: Deduplicating an Order System

7.1 Entity Class

import java.math.BigDecimal;
import java.time.LocalDateTime;
import java.util.Objects;

public class Order {

    private Long id;
    private String orderId;
    private String userId;
    private BigDecimal amount;
    private LocalDateTime orderTime;
    private String status;

    public Order() {}

    public Order(String orderId, String userId, BigDecimal amount, LocalDateTime orderTime) {
        this.orderId = orderId;
        this.userId = userId;
        this.amount = amount;
        this.orderTime = orderTime;
    }

    // Getters and setters omitted

    @Override
    public boolean equals(Object o) {
        if (this == o) return true;
        if (o == null || getClass() != o.getClass()) return false;
        Order order = (Order) o;
        return Objects.equals(orderId, order.orderId) &&
               Objects.equals(userId, order.userId);
    }

    @Override
    public int hashCode() {
        return Objects.hash(orderId, userId);
    }

    @Override
    public String toString() {
        return "Order{" +
                "id=" + id +
                ", orderId='" + orderId + '\'' +
                ", userId='" + userId + '\'' +
                ", amount=" + amount +
                ", orderTime=" + orderTime +
                '}';
    }
}

7.2 Deduplication Service

import java.sql.*;
import java.util.*;
import java.util.stream.Collectors;

public class OrderDeduplicationService {

    private final Connection connection;
    private static final int BATCH_SIZE = 1000;

    public OrderDeduplicationService(Connection connection) {
        this.connection = connection;
    }

    /**
     * Option 1: deduplicate in the database (recommended for large data sets).
     */
    public int deduplicateInDatabase() throws SQLException {
        String sql = "DELETE FROM orders WHERE id NOT IN (" +
                "  SELECT id FROM (" +
                "    SELECT id, ROW_NUMBER() OVER (" +
                "      PARTITION BY user_id, order_id ORDER BY order_time DESC" +
                "    ) AS rn FROM orders" +
                "  ) t WHERE rn = 1" +
                ")";
        try (Statement stmt = connection.createStatement()) {
            int deleted = stmt.executeUpdate(sql);
            System.out.println("Duplicate rows deleted in the database: " + deleted);
            return deleted;
        }
    }

    /**
     * Option 2: deduplicate in Java memory (suitable for small to medium data sets).
     */
    public List<Order> deduplicateInMemory(List<Order> orders) {
        // Use a Map, keeping the newest record per key
        Map<String, Order> orderMap = new HashMap<>();
        for (Order order : orders) {
            String key = order.getUserId() + ":" + order.getOrderId();
            Order existing = orderMap.get(key);
            // On collision, keep the record with the newest timestamp
            if (existing == null || order.getOrderTime().isAfter(existing.getOrderTime())) {
                orderMap.put(key, order);
            }
        }
        return new ArrayList<>(orderMap.values());
    }

    /**
     * Option 3: query in batches, deduplicate in memory, delete in bulk.
     */
    public void deduplicateBatch() throws SQLException {
        // 1. Fetch only the rows belonging to duplicated (user_id, order_id) groups
        String querySql = "SELECT * FROM orders WHERE (user_id, order_id) IN (" +
                "  SELECT user_id, order_id FROM orders " +
                "  GROUP BY user_id, order_id " +
                "  HAVING COUNT(*) > 1" +
                ") ORDER BY user_id, order_id, order_time DESC";
        List<Order> duplicateGroupOrders = new ArrayList<>();
        try (Statement stmt = connection.createStatement();
             ResultSet rs = stmt.executeQuery(querySql)) {
            while (rs.next()) {
                Order order = new Order();
                order.setId(rs.getLong("id"));
                order.setOrderId(rs.getString("order_id"));
                order.setUserId(rs.getString("user_id"));
                order.setAmount(rs.getBigDecimal("amount"));
                order.setOrderTime(rs.getTimestamp("order_time").toLocalDateTime());
                order.setStatus(rs.getString("status"));
                duplicateGroupOrders.add(order);
            }
        }
        // 2. Deduplicate in memory, keeping the newest record per group
        List<Order> uniqueOrders = deduplicateInMemory(duplicateGroupOrders);
        // 3. Compute the ids to delete: fetched rows minus the kept ones.
        //    (Deleting "id NOT IN keepIds" outright would also wipe every row
        //    that was never duplicated, so the delete must stay inside the
        //    duplicated groups.)
        Set<Long> keepIds = uniqueOrders.stream()
                .map(Order::getId)
                .collect(Collectors.toSet());
        List<Long> deleteIds = duplicateGroupOrders.stream()
                .map(Order::getId)
                .filter(id -> !keepIds.contains(id))
                .collect(Collectors.toList());
        // 4. Delete the duplicates in one statement
        if (!deleteIds.isEmpty()) {
            String deleteSql = "DELETE FROM orders WHERE id IN (" +
                    deleteIds.stream()
                            .map(String::valueOf)
                            .collect(Collectors.joining(",")) +
                    ")";
            try (Statement stmt = connection.createStatement()) {
                int deleted = stmt.executeUpdate(deleteSql);
                System.out.println("Duplicate rows deleted in bulk: " + deleted);
            }
        }
    }

    /**
     * Option 4: efficient deduplication through a temporary table.
     */
    public void deduplicateWithTempTable() throws SQLException {
        connection.setAutoCommit(false);
        try {
            // 1. Temporary table holding the rows to keep
            String createTemp = "CREATE TEMPORARY TABLE temp_orders AS " +
                    "SELECT * FROM orders WHERE id IN (" +
                    "  SELECT id FROM (" +
                    "    SELECT id, ROW_NUMBER() OVER (" +
                    "      PARTITION BY user_id, order_id ORDER BY order_time DESC" +
                    "    ) AS rn FROM orders" +
                    "  ) t WHERE rn = 1" +
                    ")";
            // 2. Empty the original table (TRUNCATE commits implicitly in MySQL,
            //    so the rollback below cannot undo it)
            String truncate = "TRUNCATE TABLE orders";
            // 3. Reinsert the kept rows
            String insertBack = "INSERT INTO orders SELECT * FROM temp_orders";
            // 4. Drop the temporary table
            String dropTemp = "DROP TEMPORARY TABLE temp_orders";
            try (Statement stmt = connection.createStatement()) {
                stmt.execute(createTemp);
                stmt.execute(truncate);
                stmt.execute(insertBack);
                stmt.execute(dropTemp);
            }
            connection.commit();
            System.out.println("Temp-table deduplication complete");
        } catch (SQLException e) {
            connection.rollback();
            throw e;
        } finally {
            connection.setAutoCommit(true);
        }
    }

    /**
     * Verify the deduplication result.
     */
    public boolean verifyDeduplication() throws SQLException {
        String sql = "SELECT user_id, order_id, COUNT(*) AS cnt " +
                "FROM orders GROUP BY user_id, order_id " +
                "HAVING COUNT(*) > 1";
        try (Statement stmt = connection.createStatement();
             ResultSet rs = stmt.executeQuery(sql)) {
            if (rs.next()) {
                System.out.println("Verification failed: duplicates remain");
                return false;
            }
        }
        System.out.println("Verification passed: no duplicates");
        return true;
    }
}

7.3 Main Program

import java.sql.*;
import java.math.BigDecimal;
import java.time.LocalDateTime;

public class DeduplicationDemo {

    public static void main(String[] args) {
        // 1. Open a database connection
        try (Connection conn = DriverManager.getConnection(
                "jdbc:mysql://localhost:3306/test", "root", "password")) {
            // 2. Create test data
            createTestData(conn);
            // 3. Run the deduplication
            OrderDeduplicationService service = new OrderDeduplicationService(conn);
            // Option 1: database-level deduplication
            // service.deduplicateInDatabase();
            // Option 2: batch processing
            service.deduplicateBatch();
            // Option 3: temp-table approach
            // service.deduplicateWithTempTable();
            // 4. Verify the result
            service.verifyDeduplication();
        } catch (SQLException e) {
            e.printStackTrace();
        }
    }

    private static void createTestData(Connection conn) throws SQLException {
        // Empty the table
        try (Statement stmt = conn.createStatement()) {
            stmt.execute("TRUNCATE TABLE orders");
        }
        // Insert duplicate data
        String sql = "INSERT INTO orders (user_id, order_id, amount, order_time, status) VALUES (?, ?, ?, ?, ?)";
        try (PreparedStatement pstmt = conn.prepareStatement(sql)) {
            // Simulate duplicates: the (user, order id) combinations repeat
            for (int i = 1; i <= 100; i++) {
                String userId = "user_" + (i % 10);   // 10 users
                String orderId = "order_" + (i % 20); // 20 order ids
                pstmt.setString(1, userId);
                pstmt.setString(2, orderId);
                pstmt.setBigDecimal(3, new BigDecimal("99.99"));
                pstmt.setTimestamp(4, Timestamp.valueOf(LocalDateTime.now().minusMinutes(i)));
                pstmt.setString(5, "ACTIVE");
                pstmt.addBatch();
                if (i % 50 == 0) {
                    pstmt.executeBatch();
                }
            }
            pstmt.executeBatch();
        }
        System.out.println("Test data created");
    }
}

8. Summary and Recommendations

8.1 Choosing a Strategy by Scenario

Under 10K rows: in-memory deduplication in Java. Pros: simple and fast. Cons: memory footprint.

10K to 1M rows: batch processing plus in-memory deduplication. Pros: balances performance and resource use. Cons: more complex to implement.

Over 1M rows: database-level deduplication. Pros: best performance. Cons: may lock the table.

Real-time systems: unique indexes plus idempotent design. Pros: prevention first. Cons: requires up-front design.

8.2 Key Takeaways

Prevention beats cleanup: stop duplicates with database constraints and idempotent application design

Pick the right tool: choose a deduplication strategy based on data volume and business needs

Monitor performance: analyze duplicate rates regularly and tune indexes and queries

Back up first: always back up the data before running a deduplication

Work in batches: process large data sets in chunks to avoid long table locks

8.3 Production Recommendations

Validate in a test environment: fully test the deduplication logic before touching production

Roll out gradually: run in batches in production and monitor the performance impact

Protect with transactions: keep each deduplication operation atomic

Log everything: record the deduplication process in detail for later troubleshooting

Prepare a rollback plan: have a data recovery plan ready for the unexpected

With the explanations and complete code examples above, you should now have a solid grasp of the core techniques and practical tricks of database deduplication in Java. In real projects, choose the approach that fits your business scenario and data volume, so that both data quality and system performance are protected.
