Remove single quotes, double quotes, commas and line breaks from a field, and delete records with duplicate values of that field, in MongoDB

Asked

Viewed 108 times

0

I have a MongoDB collection of Tweets. Each record has a field called text. I need to delete records whose text field has the same value as another record's, and also remove single quotes, double quotes, commas and line breaks from that field. To remove the records with a duplicated text field, I am trying the following:

// Attempt to de-duplicate the "TweetsBR_1_copy" collection on its "text" field.
// `registro` holds the document most recently returned by findOne(); it is
// declared outside both callbacks, so its value carries over between iterations.
var registro;
// Outer loop: visit every document the cursor over the collection returns.
db.getCollection('TweetsBR_1_copy').find().forEach( function(myDoc) {
    // Inner loop: visit every document whose "text" equals the current document's
    // (this query also matches myDoc itself while it is still in the collection).
    db.getCollection('TweetsBR_1_copy').find({"text": myDoc.text}).forEach( function(myDoc_2) {
        // Fetch a single document with that text and pass it to remove() as the query.
        registro = db.getCollection('TweetsBR_1_copy').findOne({text:myDoc_2.text})
        db.getCollection('TweetsBR_1_copy').remove(registro)
        print("registro excluido:")
        print(registro.text)
    });
    // After the inner loop, insert whatever `registro` last held back into the
    // collection; this runs once per document visited by the outer loop.
    db.getCollection('TweetsBR_1_copy').insert(registro)
    print("registro inserido:")
    print(registro.text)
});

But I have noticed that every time I run the command it deletes more and more records, so I'm not sure it's working properly. The collection has around 500K records.

Can someone give me a hand in this matter?

Thank you.

1 answer

1

I'll copy your code and add comments to try to understand what you're doing and help you:

var registro;

// for each document in the "TweetsBR_1_copy" collection

db.getCollection('TweetsBR_1_copy').find().forEach( function(myDoc) {

    // look in the same collection for one (or several) document(s) that have the same value in the "text" field

    db.getCollection('TweetsBR_1_copy').find({"text": myDoc.text}).forEach( function(myDoc_2) {

        // query again, but fetch only one document this time

        registro = db.getCollection('TweetsBR_1_copy').findOne({text:myDoc_2.text})

        // delete the document from the collection

        db.getCollection('TweetsBR_1_copy').remove(registro)
        print("registro excluido:")
        print(registro.text)
    });

    // insert the document into the collection again; this runs as many times as there are documents in the collection (you are still inside the first forEach here)

    db.getCollection('TweetsBR_1_copy').insert(registro)
    print("registro inserido:")
    print(registro.text)
});

You are deleting the record and then inserting it again, which must be why more records are deleted each time it runs. Try running something like this:

// Remove duplicate tweets: for each document in "TweetsBR_1_copy", keep that
// document and move every OTHER document with the same "text" value into the
// backup collection "TweetsBR_1_excluidos".
//
// Note: every lookup below filters on "text"; without an index on that field
// each lookup is a collection scan, which is slow on a large collection.

var tweets = db.getCollection('TweetsBR_1_copy');
var excluidos = db.getCollection('TweetsBR_1_excluidos');

// for each document in the "TweetsBR_1_copy" collection

tweets.find().forEach( function(myDoc) {

    // The cursor can still hand back a document that an earlier iteration
    // already moved to the backup collection. Skip it; otherwise the copy that
    // was kept would itself be removed as a "duplicate" of the missing document.

    if (tweets.findOne({"_id": myDoc._id}) === null) {
        return;
    }

    // Look in the same collection for the other document(s) with the same
    // "text". Excluding myDoc's own _id guarantees that exactly one copy of
    // each text stays in the collection.

    tweets.find({"text": myDoc.text, "_id": {"$ne": myDoc._id}}).forEach( function(myDoc_2) {

        // insert into the backup collection first, so the document is never
        // lost if the script stops between the two writes

        print("registro inserido na collection excluidos:");
        print(myDoc_2.text);
        excluidos.insert(myDoc_2);

        // then delete exactly this document from the collection

        tweets.remove({"_id": myDoc_2._id});
        print("registro excluido:");
        print(myDoc_2.text);
    });
});
  • Thank you so much for the answer! As soon as I have some free time I will test it. Thanks!

Browse other questions tagged

You are not signed in. Login or sign up in order to post.