Fastest way to remove duplicate documents in MongoDB

#11
My DB had millions of duplicate records. @somnath's answer did not work as is, so here is the solution that worked for me, for anyone looking to delete millions of duplicate records.

```js
/** Create an array to store the _ids of all duplicate documents */
var duplicates = [];

/** Run the aggregation pipeline */
db.collection.aggregate([
  {
    /** Add any filter here; create an index on the filter keys */
    $match: {
      filterKey: { $exists: false }
    }
  },
  {
    /** Sort so that the document you want to retain comes first */
    $sort: { createdAt: -1 }
  },
  {
    $group: {
      /** These keys define a duplicate: documents with the same
          value for key1 and key2 are considered duplicates */
      _id: { key1: "$key1", key2: "$key2" },
      dups: { $push: { _id: "$_id" } },
      count: { $sum: 1 }
    }
  },
  {
    $match: {
      count: { $gt: 1 }
    }
  }
], {
  allowDiskUse: true
}).forEach(function (doc) {
  doc.dups.shift(); // keep the first document of each group
  doc.dups.forEach(function (dupId) {
    duplicates.push(dupId._id);
  });
});

/** Delete the duplicates in chunks */
var i, j, temparray, chunk = 100000;
for (i = 0, j = duplicates.length; i < j; i += chunk) {
  temparray = duplicates.slice(i, i + chunk);
  db.collection.bulkWrite([{ deleteMany: { "filter": { "_id": { "$in": temparray } } } }]);
}
```
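
The chunked bulkWrite at the end keeps each delete batch's $in array to 100,000 _ids at a time instead of sending millions in a single filter. As a quick sanity check afterwards, re-running the grouping stage should return nothing once the duplicates are gone; a minimal sketch, assuming the same key1/key2 duplicate definition as above:

```js
// Sanity check (same hypothetical key1/key2 duplicate definition as above):
// after the deletes, this aggregation should print no documents.
db.collection.aggregate([
  { $group: { _id: { key1: "$key1", key2: "$key2" }, count: { $sum: 1 } } },
  { $match: { count: { $gt: 1 } } }
], { allowDiskUse: true }).forEach(printjson);
```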

#12
First, you can find all the duplicate values and then remove those duplicates from the DB. Here the id field is used to detect duplicates.

```js
db.collection.aggregate([
  { "$group": { "_id": "$id", "count": { "$sum": 1 } } },
  { "$match": { "_id": { "$ne": null }, "count": { "$gt": 1 } } },
  { "$sort": { "count": -1 } },
  { "$project": { "name": "$_id", "_id": 0 } }
]).toArray().then(data => {
  var dr = data.map(d => d.name);   // the duplicated id values
  console.log("duplicate records:: ", dr);
  // removes every document whose id appears more than once
  db.collection.deleteMany({ id: { $in: dr } }).then(removedD => {
    console.log("Removed duplicate data:: ", removedD);
  });
});
```
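
Note that this deletes every document whose id value is duplicated. If you want to keep one copy per group instead, a minimal sketch of a variation (same hypothetical collection and id field, retaining the first _id of each group) could look like:

```js
// Hypothetical variation: keep the first document of each duplicate group
// and delete the rest.
db.collection.aggregate([
  { "$group": { "_id": "$id", "dups": { "$push": "$_id" }, "count": { "$sum": 1 } } },
  { "$match": { "_id": { "$ne": null }, "count": { "$gt": 1 } } }
], { allowDiskUse: true }).toArray().then(groups => {
  var idsToDelete = groups.flatMap(g => g.dups.slice(1)); // drop the first _id of each group
  return db.collection.deleteMany({ _id: { $in: idsToDelete } });
}).then(res => console.log("Removed duplicates:: ", res));
```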

#13
Tips to speed this up when only a small portion of your documents are duplicated:

1. You need an index on the field used to detect duplicates.
2. $group does not use the index, but it can take advantage of a preceding $sort, and $sort does use the index, so put a $sort step at the beginning of the pipeline.
3. Do an in-place delete_many() instead of $out to a new collection; this saves a lot of IO time and disk space.

If you use pymongo, you can do:
```python
import itertools

import pymongo
from pymongo import IndexModel

# col is the pymongo Collection to deduplicate
index_uuid = IndexModel(
    [
        ('uuid', pymongo.ASCENDING)
    ],
)
col.create_indexes([index_uuid])

pipeline = [
    {"$sort": {"uuid": 1}},
    {
        "$group": {
            "_id": "$uuid",
            "dups": {"$addToSet": "$_id"},
            "count": {"$sum": 1}
        }
    },
    {
        "$match": {"count": {"$gt": 1}}
    },
]
it_cursor = col.aggregate(
    pipeline, allowDiskUse=True
)
# skip the 1st entry of each dups group so one copy is kept
dups = list(itertools.chain.from_iterable(map(lambda x: x["dups"][1:], it_cursor)))
col.delete_many({"_id": {"$in": dups}})
```

### Performance

I tested this on a database containing 30M documents, about 1 TB in size.
- Without the index/sort it takes more than an hour just to get the cursor (I did not even have the patience to wait for it).
- With the index/sort but using $out to a new collection: this is safer if your filesystem does not support snapshots, but it requires a lot of disk space and took more than 40 minutes to finish despite the fact that we were using SSDs. It will be much slower on an HDD RAID.
- With the index/sort and an in-place delete_many, it took around 5 minutes in total.

#14
The following MongoDB aggregation pipeline does the deduplication and outputs the result back to the same or a different collection.

```js
collection.aggregate([
  { $group: {
      _id: '$field_to_dedup',
      doc: { $first: '$$ROOT' }
  } },
  { $replaceRoot: {
      newRoot: '$doc'
  } },
  { $out: 'collection' }
], { allowDiskUse: true })
```
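
As a usage sketch, assuming a hypothetical movies collection deduplicated on a title field from the mongo shell, the call might look like this (point $out at a different collection name if you want to keep the original data untouched):

```js
// Hypothetical example: dedup db.movies on "title" and overwrite the
// collection in place via $out (use another name to write elsewhere).
db.movies.aggregate([
  { $group: { _id: '$title', doc: { $first: '$$ROOT' } } },
  { $replaceRoot: { newRoot: '$doc' } },
  { $out: 'movies' }
], { allowDiskUse: true });
```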



#15
I had to remove 3M duplicate records, and I did the following in the Mongo shell:

1. `use your_DB_name`
2. Run the following loop, which deletes duplicates in batches of 100,000 (30 iterations covers ~3M documents):

```js
for (var i = 1; i <= 30; i++) {
  var data = db.collectionName.aggregate([
    { "$group": { "_id": "$yourGroupById", "count": { "$sum": 1 }, "data": { "$push": "$$ROOT" } } },
    {
      $project: {
        // keep everything except the last document of each group as deletion candidates
        result: { $slice: ["$data", { $subtract: [{ $size: "$data" }, 1] }] },
        count: "$count"
      }
    },
    { "$unwind": "$result" },
    { $limit: 100000 }
  ], { allowDiskUse: true }).toArray()

  data = data.map(r => r.result._id)
  db.collectionName.deleteMany({ _id: { $in: data } })
  data = []
}
```



