Removing duplicated in mongodb not behaving as expected -
i have table in mongodb have realized has duplicated due required data parsing update based on underlying source data.
due change in source, code having unexpected behavior , inserting many duplicates.
the following query should return single value:
db.opts.find({ $query: { ticker: "vxx", date: 20150423, callput: "p", strike: 27, maturity: 20150424 }, $orderby: { maturity: 1 } })
yet due bug in code, have multiple entries observation unfortunately. 1 of observations looks this:
{ "_id": objectid("55396c1c44fea47bde858c78"), "date": 20150423, "ticker": "vxx", "callput": "p", "last": 6.1, "vol": 25, "chg": 0.43, "maturity": 20150424, "symbol": "vxx150424p00027000", "open int": 809, "strike": 27, "ask": 6.1, "bid": 5.85 }
now, goal remove duplicates share fields.
i tried running following:
db.opts.ensureindex({ date: 1, ticker: 1, callput: 1, maturity: 1, symbol: 1, strike: 1 }, { unique: true, dropdups: true })
yet duplicates did not drop.
i additionally tried:
db.opts.createindex({ date: 1, ticker: 1, callput: 1, maturity: 1, symbol: 1, strike: 1 }, { unique: true, dropdups: true })
i not have indices defined on these fields outside of documented above. collection quite vanilla created , insert data daily. nothing else yet.
the duplicate data looks this:
> db.opts.find({$query:{ticker:"vxx",date:20150423,callput:"p",strike:27}}) { "_id" : objectid("55396c1c44fea47bde858c78"), "date" : 20150423, "ticker" : "v xx", "callput" : "p", "last" : 6.1, "vol" : 25, "chg" : 0.43, "maturity" : 20150 424, "symbol" : "vxx150424p00027000", "open int" : 809, "strike" : 27, "ask" : 6 .1, "bid" : 5.85 } { "_id" : objectid("55396c1c44fea47bde858cd1"), "date" : 20150423, "ticker" : "v xx", "callput" : "p", "last" : 6.1, "vol" : 25, "chg" : 0.43, "maturity" : 20150 424, "symbol" : "vxx150424p00027000", "open int" : 809, "strike" : 27, "ask" : 6 .1, "bid" : 5.85 } { "_id" : objectid("55396c1c44fea47bde858d2a"), "date" : 20150423, "ticker" : "v xx", "callput" : "p", "last" : 6.1, "vol" : 25, "chg" : 0.43, "maturity" : 20150 424, "symbol" : "vxx150424p00027000", "open int" : 809, "strike" : 27, "ask" : 6 .1, "bid" : 5.85 } { "_id" : objectid("55396c1d44fea47bde858d83"), "date" : 20150423, "ticker" : "v xx", "callput" : "p", "last" : 6.1, "vol" : 25, "chg" : 0.43, "maturity" : 20150 424, "symbol" : "vxx150424p00027000", "open int" : 809, "strike" : 27, "ask" : 6 .1, "bid" : 5.85 } { "_id" : objectid("55396c1d44fea47bde858ddc"), "date" : 20150423, "ticker" : "v xx", "callput" : "p", "last" : 6.1, "vol" : 25, "chg" : 0.43, "maturity" : 20150 424, "symbol" : "vxx150424p00027000", "open int" : 809, "strike" : 27, "ask" : 6 .1, "bid" : 5.85 } { "_id" : objectid("55396c1d44fea47bde858e35"), "date" : 20150423, "ticker" : "v xx", "callput" : "p", "last" : 6.1, "vol" : 25, "chg" : 0.43, "maturity" : 20150 424, "symbol" : "vxx150424p00027000", "open int" : 809, "strike" : 27, "ask" : 6 .1, "bid" : 5.85 } { "_id" : objectid("55396c1e44fea47bde858e8e"), "date" : 20150423, "ticker" : "v xx", "callput" : "p", "last" : 6.1, "vol" : 25, "chg" : 0.43, "maturity" : 20150 424, "symbol" : "vxx150424p00027000", "open int" : 809, "strike" : 27, "ask" : 6 .1, "bid" : 5.85 } { "_id" : objectid("55396c1e44fea47bde858ee7"), "date" : 20150423, "ticker" : "v xx", "callput" : "p", "last" : 6.1, "vol" : 25, "chg" : 0.43, "maturity" : 20150 424, "symbol" : "vxx150424p00027000", "open int" : 809, "strike" : 27, "ask" : 6 .1, "bid" : 5.85 } { "_id" : objectid("55396c1e44fea47bde858f40"), "date" : 20150423, "ticker" : "v xx", "callput" : "p", "last" : 6.1, "vol" : 25, "chg" : 0.43, "maturity" : 20150 424, "symbol" : "vxx150424p00027000", "open int" : 809, "strike" : 27, "ask" : 6 .1, "bid" : 5.85 } { "_id" : objectid("55396c1f44fea47bde858f99"), "date" : 20150423, "ticker" : "v xx", "callput" : "p", "last" : 6.1, "vol" : 25, "chg" : 0.43, "maturity" : 20150 424, "symbol" : "vxx150424p00027000", "open int" : 809, "strike" : 27, "ask" : 6 .1, "bid" : 5.85 } { "_id" : objectid("55396c1f44fea47bde858ff2"), "date" : 20150423, "ticker" : "v xx", "callput" : "p", "last" : 6.1, "vol" : 25, "chg" : 0.43, "maturity" : 20150 424, "symbol" : "vxx150424p00027000", "open int" : 809, "strike" : 27, "ask" : 6 .1, "bid" : 5.85 } { "_id" : objectid("55396c2044fea47bde85904b"), "date" : 20150423, "ticker" : "v xx", "callput" : "p", "last" : 6.1, "vol" : 25, "chg" : 0.43, "maturity" : 20150 424, "symbol" : "vxx150424p00027000", "open int" : 809, "strike" : 27, "ask" : 6 .1, "bid" : 5.85 } { "_id" : objectid("55396c2044fea47bde8590a4"), "date" : 20150423, "ticker" : "v xx", "callput" : "p", "last" : 6.1, "vol" : 25, "chg" : 0.43, "maturity" : 20150 424, "symbol" : "vxx150424p00027000", "open int" : 809, "strike" : 27, "ask" : 6 .1, "bid" : 5.85 } { "_id" : objectid("55396c2044fea47bde8590fd"), "date" : 20150423, "ticker" : "v xx", "callput" : "p", "last" : 6.1, "vol" : 25, "chg" : 0.43, "maturity" : 20150 424, "symbol" : "vxx150424p00027000", "open int" : 809, "strike" : 27, "ask" : 6 .1, "bid" : 5.85 } { "_id" : objectid("55396c2144fea47bde859156"), "date" : 20150423, "ticker" : "v xx", "callput" : "p", "last" : 6.1, "vol" : 25, "chg" : 0.43, "maturity" : 20150 424, "symbol" : "vxx150424p00027000", "open int" : 809, "strike" : 27, "ask" : 6 .1, "bid" : 5.85 } { "_id" : objectid("55396c2144fea47bde8591af"), "date" : 20150423, "ticker" : "v xx", "callput" : "p", "last" : 6.1, "vol" : 25, "chg" : 0.43, "maturity" : 20150 424, "symbol" : "vxx150424p00027000", "open int" : 809, "strike" : 27, "ask" : 6 .1, "bid" : 5.85 } { "_id" : objectid("55396c2244fea47bde859208"), "date" : 20150423, "ticker" : "v xx", "callput" : "p", "last" : 6.1, "vol" : 25, "chg" : 0.43, "maturity" : 20150 424, "symbol" : "vxx150424p00027000", "open int" : 809, "strike" : 27, "ask" : 6 .1, "bid" : 5.85 } { "_id" : objectid("55396c2244fea47bde859261"), "date" : 20150423, "ticker" : "v xx", "callput" : "p", "last" : 6.1, "vol" : 25, "chg" : 0.43, "maturity" : 20150 424, "symbol" : "vxx150424p00027000", "open int" : 809, "strike" : 27, "ask" : 6 .1, "bid" : 5.85 } { "_id" : objectid("55396c2244fea47bde8592ba"), "date" : 20150423, "ticker" : "v xx", "callput" : "p", "last" : 6.1, "vol" : 25, "chg" : 0.43, "maturity" : 20150 424, "symbol" : "vxx150424p00027000", "open int" : 809, "strike" : 27, "ask" : 6 .1, "bid" : 5.85 } { "_id" : objectid("55396c2344fea47bde859313"), "date" : 20150423, "ticker" : "v xx", "callput" : "p", "last" : 6.1, "vol" : 25, "chg" : 0.43, "maturity" : 20150 424, "symbol" : "vxx150424p00027000", "open int" : 809, "strike" : 27, "ask" : 6 .1, "bid" : 5.85 } type "it" more >
how remove these duplicates?
the dropdups
option no longer available in 3.0, can pretty in shell little script iterates on whole collection , removes docs duplicate ones have same key values:
var keys = {}; db.opts.find().foreach(function(doc) { // create key concatenates keys must unique. var key = ''.concat(doc.date, doc.ticker, doc.callput, doc.maturity, doc.symbol, doc.strike); if (keys[key]) { // doc key has been seen, remove doc. db.opts.remove({_id: doc._id}); } else { keys[key] = true; } });
obviously, make backup before doing in case doesn't work you're expecting.
Comments
Post a Comment