Why MongoDB is so slow on some queries while using IXSCAN as a winning plan?
I have DB with about 30M records in a collection with about 100GB collection size (total documents and indexes).
I have a compound index that filters data based on user_id and some other fields like: is_active
, is_logged_in
, etc.
With MongoDB Compass
I see slow queries of about 10s, 20s or even 40 seconds! I ran the exact same query and result is fetched less than 500ms (though it may get cached on second try).
When I get that ongoing op
stat, I see the following lock status:
"lockStats": {
"Global": {
"acquireCount": {
"r": 574
}
},
"MMAPV1Journal": {
"acquireCount": {
"r": 295
},
"acquireWaitCount": {
"r": 2
},
"timeAcquiringMicros": {
"r": 15494
}
},
}
acquireCount
: Number of times the operation acquired the lock in the specified mode is so high compared to the fast query (on another collection) that has the below status:
"lockStats": {
"Global": {
"acquireCount": {
"r": 2
}
},
"MMAPV1Journal": {
"acquireCount": {
"r": 1
}
},
"Database": {
"acquireCount": {
"r": 1
}
},
"Collection": {
"acquireCount": {
"R": 1
}
}
}
When operations are slow, and a user that has many records take so long it has domino effect on all the other operations after a few seconds.
When I explain the query on the large collection I can see the result that it has used an index:
{
"queryPlanner" : {
"plannerVersion" : 1,
"namespace" : "cuda.call_history",
"indexFilterSet" : false,
"parsedQuery" : {
"$and" : [
{
"$or" : [
{
"trk.0.extra_data.spam.is_spam" : {
"$eq" : false
}
},
{
"$nor" : [
{
"trk.0.extra_data.spam.is_spam" : {
"$exists" : true
}
}
]
}
]
},
{
"is_removed" : {
"$eq" : false
}
},
{
"user_id" : {
"$eq" : "00000000000040008000000000002a5d"
}
},
{
"trk.0.direction" : {
"$eq" : "ingress"
}
},
{
"trk.0.type" : {
"$eq" : "fax"
}
},
{
"date" : {
"$lt" : "2018-01-09 10:36:31"
}
},
{
"date" : {
"$gt" : "1970-01-01 00:00:00"
}
},
{
"trk.0.data.status" : {
"$in" : [
"p_received",
"success"
]
}
}
]
},
"winningPlan" : {
"stage" : "FETCH",
"filter" : {
"$and" : [
{
"$or" : [
{
"trk.0.extra_data.spam.is_spam" : {
"$eq" : false
}
},
{
"$nor" : [
{
"trk.0.extra_data.spam.is_spam" : {
"$exists" : true
}
}
]
}
]
},
{
"trk.0.type" : {
"$eq" : "fax"
}
},
{
"date" : {
"$lt" : "2018-01-09 10:36:31"
}
},
{
"date" : {
"$gt" : "1970-01-01 00:00:00"
}
}
]
},
"inputStage" : {
"stage" : "IXSCAN",
"keyPattern" : {
"user_id" : 1,
"trk.0.direction" : 1,
"is_read" : 1,
"trk.0.data.status" : 1,
"is_removed" : 1
},
"indexName" : "user_id_direction_is_read_status_is_removed",
"isMultiKey" : false,
"isUnique" : false,
"isSparse" : false,
"isPartial" : false,
"indexVersion" : 1,
"direction" : "forward",
"indexBounds" : {
"user_id" : [
"["00000000000040008000000000002a5d", "00000000000040008000000000002a5d"]"
],
"trk.0.direction" : [
"["ingress", "ingress"]"
],
"is_read" : [
"[MinKey, MaxKey]"
],
"trk.0.data.status" : [
"["p_received", "p_received"]",
"["success", "success"]"
],
"is_removed" : [
"[false, false]"
]
}
}
},
"rejectedPlans" : [
{
"stage" : "FETCH",
"filter" : {
"$and" : [
{
"$or" : [
{
"trk.0.extra_data.spam.is_spam" : {
"$eq" : false
}
},
{
"$nor" : [
{
"trk.0.extra_data.spam.is_spam" : {
"$exists" : true
}
}
]
}
]
},
{
"is_removed" : {
"$eq" : false
}
},
{
"trk.0.direction" : {
"$eq" : "ingress"
}
},
{
"trk.0.type" : {
"$eq" : "fax"
}
},
{
"trk.0.data.status" : {
"$in" : [
"p_received",
"success"
]
}
}
]
},
"inputStage" : {
"stage" : "IXSCAN",
"keyPattern" : {
"user_id" : 1,
"date" : -1
},
"indexName" : "user_id_date",
"isMultiKey" : false,
"isUnique" : false,
"isSparse" : false,
"isPartial" : false,
"indexVersion" : 1,
"direction" : "forward",
"indexBounds" : {
"user_id" : [
"["00000000000040008000000000002a5d", "00000000000040008000000000002a5d"]"
],
"date" : [
"("2018-01-09 10:36:31", "1970-01-01 00:00:00")"
]
}
}
},
{
"stage" : "FETCH",
"filter" : {
"$and" : [
{
"$or" : [
{
"trk.0.extra_data.spam.is_spam" : {
"$eq" : false
}
},
{
"$nor" : [
{
"trk.0.extra_data.spam.is_spam" : {
"$exists" : true
}
}
]
}
]
},
{
"is_removed" : {
"$eq" : false
}
},
{
"trk.0.direction" : {
"$eq" : "ingress"
}
},
{
"trk.0.type" : {
"$eq" : "fax"
}
},
{
"date" : {
"$lt" : "2018-01-09 10:36:31"
}
},
{
"date" : {
"$gt" : "1970-01-01 00:00:00"
}
},
{
"trk.0.data.status" : {
"$in" : [
"p_received",
"success"
]
}
}
]
},
"inputStage" : {
"stage" : "IXSCAN",
"keyPattern" : {
"user_id" : 1,
"to" : 1,
"from" : 1
},
"indexName" : "user_id_to_from",
"isMultiKey" : false,
"isUnique" : false,
"isSparse" : false,
"isPartial" : false,
"indexVersion" : 1,
"direction" : "forward",
"indexBounds" : {
"user_id" : [
"["00000000000040008000000000002a5d", "00000000000040008000000000002a5d"]"
],
"to" : [
"[MinKey, MaxKey]"
],
"from" : [
"[MinKey, MaxKey]"
]
}
}
}
]
},
"executionStats" : {
"executionSuccess" : true,
"nReturned" : 4682,
"executionTimeMillis" : 2072,
"totalKeysExamined" : 4688,
"totalDocsExamined" : 4682,
"executionStages" : {
"stage" : "FETCH",
"filter" : {
"$and" : [
{
"$or" : [
{
"trk.0.extra_data.spam.is_spam" : {
"$eq" : false
}
},
{
"$nor" : [
{
"trk.0.extra_data.spam.is_spam" : {
"$exists" : true
}
}
]
}
]
},
{
"trk.0.type" : {
"$eq" : "fax"
}
},
{
"date" : {
"$lt" : "2018-01-09 10:36:31"
}
},
{
"date" : {
"$gt" : "1970-01-01 00:00:00"
}
}
]
},
"nReturned" : 4682,
"executionTimeMillisEstimate" : 710,
"works" : 4897,
"advanced" : 4682,
"needTime" : 5,
"needYield" : 209,
"saveState" : 234,
"restoreState" : 234,
"isEOF" : 1,
"invalidates" : 1,
"docsExamined" : 4682,
"alreadyHasObj" : 0,
"inputStage" : {
"stage" : "IXSCAN",
"nReturned" : 4682,
"executionTimeMillisEstimate" : 305,
"works" : 4688,
"advanced" : 4682,
"needTime" : 5,
"needYield" : 0,
"saveState" : 234,
"restoreState" : 234,
"isEOF" : 1,
"invalidates" : 1,
"keyPattern" : {
"user_id" : 1,
"trk.0.direction" : 1,
"is_read" : 1,
"trk.0.data.status" : 1,
"is_removed" : 1
},
"indexName" : "user_id_direction_is_read_status_is_removed",
"isMultiKey" : false,
"isUnique" : false,
"isSparse" : false,
"isPartial" : false,
"indexVersion" : 1,
"direction" : "forward",
"indexBounds" : {
"user_id" : [
"["00000000000040008000000000002a5d", "00000000000040008000000000002a5d"]"
],
"trk.0.direction" : [
"["ingress", "ingress"]"
],
"is_read" : [
"[MinKey, MaxKey]"
],
"trk.0.data.status" : [
"["p_received", "p_received"]",
"["success", "success"]"
],
"is_removed" : [
"[false, false]"
]
},
"keysExamined" : 4688,
"seeks" : 6,
"dupsTested" : 0,
"dupsDropped" : 0,
"seenInvalidated" : 0
}
}
},
"serverInfo" : {
"host" : “hs1.mydomain.com”,
"port" : 27017,
"version" : "3.4.10",
"gitVersion" : "078f28920cb24de0dd479b5ea6c66c644f6326e9"
},
"ok" : 1.0
}
keysExamined
is just 4,688! It is not much compared to the total data of the collection which is 30M documents. When Mongo gets slow when it has domino effect, CPU usage and Memory is not high. Mongo just uses 40% of the memory. Disk partition is Ext4
if that helps.
Another example of a very slow query in full details:
{
"desc": "conn199276",
"threadId": "140070259820288",
"connectionId": 199276,
"client": "client_server_ip:45590",
"active": "true",
"opid": 63869351,
"secs_running": 36,
"microsecs_running": 36136211,
"op": "query",
"ns": "cuda.call_history",
"query": {
"find": "call_history",
"filter": {
"is_removed": false,
"trk.0.extra_data.spam.is_spam": true,
"trk.0.direction": "ingress",
"date": {
"$gt": "1970-01-01 00:00:00",
"$lt": "4001-01-01 00:00:00"
},
"trk.0.extra_data.status": {
"$in": [
"success",
"p_received"
]
},
"trk.0.type": "clk",
"owner_id": "00000000000040008000000000003828"
},
"sort": {
"date": -1
},
"limit": 31
},
"numYields": 6600,
"locks": {},
"waitingForLock": "false",
"lockStats": {
"Global": {
"acquireCount": {
"r": 13200
}
},
"MMAPV1Journal": {
"acquireCount": {
"r": 6611
},
"acquireWaitCount": {
"r": 9
},
"timeAcquiringMicros": {
"r": 50854
}
},
"Database": {
"acquireCount": {
"r": 6600
}
},
"Collection": {
"acquireCount": {
"R": 6600
},
"acquireWaitCount": {
"R": 11
},
"timeAcquiringMicros": {
"R": 163707
}
}
}
}
Output of db.stats()
:
rs0:PRIMARY> db.stats()
{
"db" : "cuda",
"collections" : 5,
"views" : 0,
"objects" : 55009248,
"avgObjSize" : 2018.6135346551184,
"dataSize" : 111042412544,
"storageSize" : 113055362336,
"numExtents" : 100,
"indexes" : 7,
"indexSize" : 14223460160,
"fileSize" : 133012914176,
"nsSizeMB" : 16,
"extentFreeList" : {
"num" : 0,
"totalSize" : 0
},
"dataFileVersion" : {
"major" : 4,
"minor" : 22
},
"ok" : 1
}
mongostat
displays the below result, I think fault number is high:
insert query update delete getmore command flushes mapped vsize res faults qrw arw net_in net_out conn set repl time
5 93 4 *0 0 64|0 0 282G 9.11G 26 0|0 0|0 64.3k 187k 481 rs0 PRI Jan 10 06:25:14.476
*0 107 *0 1 0 58|0 0 282G 9.14G 4 0|0 0|0 51.5k 247k 481 rs0 PRI Jan 10 06:25:15.475
2 88 5 *0 0 70|0 0 282G 9.04G 26 0|0 0|0 61.5k 245k 481 rs0 PRI Jan 10 06:25:16.476
3 98 2 *0 0 71|0 0 282G 9.12G 6 0|0 0|0 59.6k 274k 481 rs0 PRI Jan 10 06:25:17.474
1 105 *0 1 0 82|0 0 282G 9.10G 14 0|0 0|0 63.4k 254k 481 rs0 PRI Jan 10 06:25:18.476
1 88 2 *0 0 68|0 0 282G 9.12G 75 0|0 0|0 49.9k 194k 481 rs0 PRI Jan 10 06:25:19.476
5 120 4 *0 0 78|0 0 282G 9.11G 5 0|0 0|0 75.7k 548k 484 rs0 PRI Jan 10 06:25:20.476
3 89 *0 4 0 64|0 0 282G 9.14G 19 0|0 0|0 55.3k 1.59m 483 rs0 PRI Jan 10 06:25:21.476
*0 81 *0 *0 0 62|0 0 282G 9.17G 0 0|0 0|0 46.9k 265k 481 rs0 PRI Jan 10 06:25:22.476
6 105 3 *0 0 67|0 0 282G 9.15G 49 0|0 0|0 61.8k 240k 481 rs0 PRI Jan 10 06:25:23.475
And under load we have high page faults:
insert query update delete getmore command flushes mapped vsize res faults qrw arw net_in net_out conn set repl time
1 96 *0 *0 0 70|0 0 282G 10.9G 597 0|0 2|0 59.1k 1.06m 484 rs0 PRI Jan 10 07:48:45.061
1 103 3 *0 0 79|0 0 282G 10.9G 715 0|0 0|0 66.4k 176k 484 rs0 PRI Jan 10 07:48:46.060
2 102 *0 *0 0 72|0 0 282G 10.9G 718 0|0 2|0 59.6k 154k 484 rs0 PRI Jan 10 07:48:47.060
2 104 2 1 0 83|0 0 282G 11.0G 692 0|0 3|0 68.7k 154k 483 rs0 PRI Jan 10 07:48:48.061
2 103 2 *0 0 86|0 0 282G 11.0G 743 0|0 1|0 64.2k 154k 482 rs0 PRI Jan 10 07:48:49.059
*0 116 2 *0 0 95|0 0 282G 11.0G 715 0|0 2|0 68.5k 237k 484 rs0 PRI Jan 10 07:48:50.060
1 112 6 1 0 78|0 0 282G 11.0G 595 0|0 2|0 74.3k 231k 481 rs0 PRI Jan 10 07:48:51.061
2 137 5 1 0 83|0 0 282G 11.0G 733 0|0 3|0 81.6k 301k 480 rs0 PRI Jan 10 07:48:52.060
3 157 5 1 0 92|0 0 282G 11.0G 697 0|0 3|0 89.5k 183k 481 rs0 PRI Jan 10 07:48:53.059
4 102 3 *0 0 73|0 0 282G 10.9G 599 0|0 1|0 67.1k 154k 480 rs0 PRI Jan 10 07:48:54.060
We have about 15GB of free memory that MongoDB
does not use!
On peak we reach 400 OPS, and 500 connections per seconds. Is there anything I could do to improve the performance?
EDIT
iostat
output of disk:
Device: rrqm/s wrqm/s r/s w/s rkB/s wkB/s avgrq-sz avgqu-sz await r_await w_await svctm %util
sda 0.00 87.00 7.00 392.00 460.00 2544.00 15.06 0.08 0.22 8.00 0.08 0.12 4.80
dm-0 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
dm-1 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
dm-2 0.00 0.00 7.00 478.00 460.00 2536.00 12.35 0.09 0.19 8.00 0.08 0.10 4.80
Device: rrqm/s wrqm/s r/s w/s rkB/s wkB/s avgrq-sz avgqu-sz await r_await w_await svctm %util
sda 0.00 0.00 84.00 6.00 3620.00 56.00 81.69 0.82 8.84 9.48 0.00 4.13 37.20
dm-0 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
dm-1 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
dm-2 0.00 0.00 85.00 6.00 3692.00 56.00 82.37 0.82 8.75 9.36 0.00 4.09 37.20
Device: rrqm/s wrqm/s r/s w/s rkB/s wkB/s avgrq-sz avgqu-sz await r_await w_await svctm %util
sda 0.00 0.00 57.00 3.00 2804.00 52.00 95.20 0.44 7.73 7.79 6.67 4.80 28.80
dm-0 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
dm-1 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
dm-2 0.00 0.00 56.00 3.00 2732.00 52.00 94.37 0.44 7.80 7.86 6.67 4.88 28.80
Another snapshot:
Device: rrqm/s wrqm/s r/s w/s rkB/s wkB/s avgrq-sz avgqu-sz await r_await w_await svctm %util
sda 0.00 11.00 267.67 5.00 20813.33 97.33 153.38 1.28 4.68 4.70 3.73 3.56 97.07
dm-0 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
dm-1 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
dm-2 0.00 0.00 267.67 16.00 20806.67 97.33 147.38 1.28 4.50 4.70 1.17 3.42 97.07
Device: rrqm/s wrqm/s r/s w/s rkB/s wkB/s avgrq-sz avgqu-sz await r_await w_await svctm %util
sda 0.00 21.00 221.67 3.00 19089.33 110.67 170.92 1.18 5.26 5.16 12.44 3.95 88.80
dm-0 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
dm-1 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
dm-2 0.00 0.00 221.33 24.00 19061.33 110.67 156.29 1.18 4.82 5.17 1.56 3.62 88.80
EDIT 2:
The output of iostat 1
in order to remove extra information based on read and write:
https://pastebin.com/32WuRSPF
Another result which tps
reached around 1000:
https://pastebin.com/mrQdKZZF
index query-performance mongodb
bumped to the homepage by Community♦ 12 mins ago
This question has answers that may be good or bad; the system has marked it active so that they can be reviewed.
|
show 7 more comments
I have DB with about 30M records in a collection with about 100GB collection size (total documents and indexes).
I have a compound index that filters data based on user_id and some other fields like: is_active
, is_logged_in
, etc.
With MongoDB Compass
I see slow queries of about 10s, 20s or even 40 seconds! I ran the exact same query and result is fetched less than 500ms (though it may get cached on second try).
When I get that ongoing op
stat, I see the following lock status:
"lockStats": {
"Global": {
"acquireCount": {
"r": 574
}
},
"MMAPV1Journal": {
"acquireCount": {
"r": 295
},
"acquireWaitCount": {
"r": 2
},
"timeAcquiringMicros": {
"r": 15494
}
},
}
acquireCount
: Number of times the operation acquired the lock in the specified mode is so high compared to the fast query (on another collection) that has the below status:
"lockStats": {
"Global": {
"acquireCount": {
"r": 2
}
},
"MMAPV1Journal": {
"acquireCount": {
"r": 1
}
},
"Database": {
"acquireCount": {
"r": 1
}
},
"Collection": {
"acquireCount": {
"R": 1
}
}
}
When operations are slow, and a user that has many records take so long it has domino effect on all the other operations after a few seconds.
When I explain the query on the large collection I can see the result that it has used an index:
{
"queryPlanner" : {
"plannerVersion" : 1,
"namespace" : "cuda.call_history",
"indexFilterSet" : false,
"parsedQuery" : {
"$and" : [
{
"$or" : [
{
"trk.0.extra_data.spam.is_spam" : {
"$eq" : false
}
},
{
"$nor" : [
{
"trk.0.extra_data.spam.is_spam" : {
"$exists" : true
}
}
]
}
]
},
{
"is_removed" : {
"$eq" : false
}
},
{
"user_id" : {
"$eq" : "00000000000040008000000000002a5d"
}
},
{
"trk.0.direction" : {
"$eq" : "ingress"
}
},
{
"trk.0.type" : {
"$eq" : "fax"
}
},
{
"date" : {
"$lt" : "2018-01-09 10:36:31"
}
},
{
"date" : {
"$gt" : "1970-01-01 00:00:00"
}
},
{
"trk.0.data.status" : {
"$in" : [
"p_received",
"success"
]
}
}
]
},
"winningPlan" : {
"stage" : "FETCH",
"filter" : {
"$and" : [
{
"$or" : [
{
"trk.0.extra_data.spam.is_spam" : {
"$eq" : false
}
},
{
"$nor" : [
{
"trk.0.extra_data.spam.is_spam" : {
"$exists" : true
}
}
]
}
]
},
{
"trk.0.type" : {
"$eq" : "fax"
}
},
{
"date" : {
"$lt" : "2018-01-09 10:36:31"
}
},
{
"date" : {
"$gt" : "1970-01-01 00:00:00"
}
}
]
},
"inputStage" : {
"stage" : "IXSCAN",
"keyPattern" : {
"user_id" : 1,
"trk.0.direction" : 1,
"is_read" : 1,
"trk.0.data.status" : 1,
"is_removed" : 1
},
"indexName" : "user_id_direction_is_read_status_is_removed",
"isMultiKey" : false,
"isUnique" : false,
"isSparse" : false,
"isPartial" : false,
"indexVersion" : 1,
"direction" : "forward",
"indexBounds" : {
"user_id" : [
"["00000000000040008000000000002a5d", "00000000000040008000000000002a5d"]"
],
"trk.0.direction" : [
"["ingress", "ingress"]"
],
"is_read" : [
"[MinKey, MaxKey]"
],
"trk.0.data.status" : [
"["p_received", "p_received"]",
"["success", "success"]"
],
"is_removed" : [
"[false, false]"
]
}
}
},
"rejectedPlans" : [
{
"stage" : "FETCH",
"filter" : {
"$and" : [
{
"$or" : [
{
"trk.0.extra_data.spam.is_spam" : {
"$eq" : false
}
},
{
"$nor" : [
{
"trk.0.extra_data.spam.is_spam" : {
"$exists" : true
}
}
]
}
]
},
{
"is_removed" : {
"$eq" : false
}
},
{
"trk.0.direction" : {
"$eq" : "ingress"
}
},
{
"trk.0.type" : {
"$eq" : "fax"
}
},
{
"trk.0.data.status" : {
"$in" : [
"p_received",
"success"
]
}
}
]
},
"inputStage" : {
"stage" : "IXSCAN",
"keyPattern" : {
"user_id" : 1,
"date" : -1
},
"indexName" : "user_id_date",
"isMultiKey" : false,
"isUnique" : false,
"isSparse" : false,
"isPartial" : false,
"indexVersion" : 1,
"direction" : "forward",
"indexBounds" : {
"user_id" : [
"["00000000000040008000000000002a5d", "00000000000040008000000000002a5d"]"
],
"date" : [
"("2018-01-09 10:36:31", "1970-01-01 00:00:00")"
]
}
}
},
{
"stage" : "FETCH",
"filter" : {
"$and" : [
{
"$or" : [
{
"trk.0.extra_data.spam.is_spam" : {
"$eq" : false
}
},
{
"$nor" : [
{
"trk.0.extra_data.spam.is_spam" : {
"$exists" : true
}
}
]
}
]
},
{
"is_removed" : {
"$eq" : false
}
},
{
"trk.0.direction" : {
"$eq" : "ingress"
}
},
{
"trk.0.type" : {
"$eq" : "fax"
}
},
{
"date" : {
"$lt" : "2018-01-09 10:36:31"
}
},
{
"date" : {
"$gt" : "1970-01-01 00:00:00"
}
},
{
"trk.0.data.status" : {
"$in" : [
"p_received",
"success"
]
}
}
]
},
"inputStage" : {
"stage" : "IXSCAN",
"keyPattern" : {
"user_id" : 1,
"to" : 1,
"from" : 1
},
"indexName" : "user_id_to_from",
"isMultiKey" : false,
"isUnique" : false,
"isSparse" : false,
"isPartial" : false,
"indexVersion" : 1,
"direction" : "forward",
"indexBounds" : {
"user_id" : [
"["00000000000040008000000000002a5d", "00000000000040008000000000002a5d"]"
],
"to" : [
"[MinKey, MaxKey]"
],
"from" : [
"[MinKey, MaxKey]"
]
}
}
}
]
},
"executionStats" : {
"executionSuccess" : true,
"nReturned" : 4682,
"executionTimeMillis" : 2072,
"totalKeysExamined" : 4688,
"totalDocsExamined" : 4682,
"executionStages" : {
"stage" : "FETCH",
"filter" : {
"$and" : [
{
"$or" : [
{
"trk.0.extra_data.spam.is_spam" : {
"$eq" : false
}
},
{
"$nor" : [
{
"trk.0.extra_data.spam.is_spam" : {
"$exists" : true
}
}
]
}
]
},
{
"trk.0.type" : {
"$eq" : "fax"
}
},
{
"date" : {
"$lt" : "2018-01-09 10:36:31"
}
},
{
"date" : {
"$gt" : "1970-01-01 00:00:00"
}
}
]
},
"nReturned" : 4682,
"executionTimeMillisEstimate" : 710,
"works" : 4897,
"advanced" : 4682,
"needTime" : 5,
"needYield" : 209,
"saveState" : 234,
"restoreState" : 234,
"isEOF" : 1,
"invalidates" : 1,
"docsExamined" : 4682,
"alreadyHasObj" : 0,
"inputStage" : {
"stage" : "IXSCAN",
"nReturned" : 4682,
"executionTimeMillisEstimate" : 305,
"works" : 4688,
"advanced" : 4682,
"needTime" : 5,
"needYield" : 0,
"saveState" : 234,
"restoreState" : 234,
"isEOF" : 1,
"invalidates" : 1,
"keyPattern" : {
"user_id" : 1,
"trk.0.direction" : 1,
"is_read" : 1,
"trk.0.data.status" : 1,
"is_removed" : 1
},
"indexName" : "user_id_direction_is_read_status_is_removed",
"isMultiKey" : false,
"isUnique" : false,
"isSparse" : false,
"isPartial" : false,
"indexVersion" : 1,
"direction" : "forward",
"indexBounds" : {
"user_id" : [
"["00000000000040008000000000002a5d", "00000000000040008000000000002a5d"]"
],
"trk.0.direction" : [
"["ingress", "ingress"]"
],
"is_read" : [
"[MinKey, MaxKey]"
],
"trk.0.data.status" : [
"["p_received", "p_received"]",
"["success", "success"]"
],
"is_removed" : [
"[false, false]"
]
},
"keysExamined" : 4688,
"seeks" : 6,
"dupsTested" : 0,
"dupsDropped" : 0,
"seenInvalidated" : 0
}
}
},
"serverInfo" : {
"host" : “hs1.mydomain.com”,
"port" : 27017,
"version" : "3.4.10",
"gitVersion" : "078f28920cb24de0dd479b5ea6c66c644f6326e9"
},
"ok" : 1.0
}
keysExamined
is just 4,688! It is not much compared to the total data of the collection which is 30M documents. When Mongo gets slow when it has domino effect, CPU usage and Memory is not high. Mongo just uses 40% of the memory. Disk partition is Ext4
if that helps.
Another example of a very slow query in full details:
{
"desc": "conn199276",
"threadId": "140070259820288",
"connectionId": 199276,
"client": "client_server_ip:45590",
"active": "true",
"opid": 63869351,
"secs_running": 36,
"microsecs_running": 36136211,
"op": "query",
"ns": "cuda.call_history",
"query": {
"find": "call_history",
"filter": {
"is_removed": false,
"trk.0.extra_data.spam.is_spam": true,
"trk.0.direction": "ingress",
"date": {
"$gt": "1970-01-01 00:00:00",
"$lt": "4001-01-01 00:00:00"
},
"trk.0.extra_data.status": {
"$in": [
"success",
"p_received"
]
},
"trk.0.type": "clk",
"owner_id": "00000000000040008000000000003828"
},
"sort": {
"date": -1
},
"limit": 31
},
"numYields": 6600,
"locks": {},
"waitingForLock": "false",
"lockStats": {
"Global": {
"acquireCount": {
"r": 13200
}
},
"MMAPV1Journal": {
"acquireCount": {
"r": 6611
},
"acquireWaitCount": {
"r": 9
},
"timeAcquiringMicros": {
"r": 50854
}
},
"Database": {
"acquireCount": {
"r": 6600
}
},
"Collection": {
"acquireCount": {
"R": 6600
},
"acquireWaitCount": {
"R": 11
},
"timeAcquiringMicros": {
"R": 163707
}
}
}
}
Output of db.stats()
:
rs0:PRIMARY> db.stats()
{
"db" : "cuda",
"collections" : 5,
"views" : 0,
"objects" : 55009248,
"avgObjSize" : 2018.6135346551184,
"dataSize" : 111042412544,
"storageSize" : 113055362336,
"numExtents" : 100,
"indexes" : 7,
"indexSize" : 14223460160,
"fileSize" : 133012914176,
"nsSizeMB" : 16,
"extentFreeList" : {
"num" : 0,
"totalSize" : 0
},
"dataFileVersion" : {
"major" : 4,
"minor" : 22
},
"ok" : 1
}
mongostat
displays the below result, I think fault number is high:
insert query update delete getmore command flushes mapped vsize res faults qrw arw net_in net_out conn set repl time
5 93 4 *0 0 64|0 0 282G 9.11G 26 0|0 0|0 64.3k 187k 481 rs0 PRI Jan 10 06:25:14.476
*0 107 *0 1 0 58|0 0 282G 9.14G 4 0|0 0|0 51.5k 247k 481 rs0 PRI Jan 10 06:25:15.475
2 88 5 *0 0 70|0 0 282G 9.04G 26 0|0 0|0 61.5k 245k 481 rs0 PRI Jan 10 06:25:16.476
3 98 2 *0 0 71|0 0 282G 9.12G 6 0|0 0|0 59.6k 274k 481 rs0 PRI Jan 10 06:25:17.474
1 105 *0 1 0 82|0 0 282G 9.10G 14 0|0 0|0 63.4k 254k 481 rs0 PRI Jan 10 06:25:18.476
1 88 2 *0 0 68|0 0 282G 9.12G 75 0|0 0|0 49.9k 194k 481 rs0 PRI Jan 10 06:25:19.476
5 120 4 *0 0 78|0 0 282G 9.11G 5 0|0 0|0 75.7k 548k 484 rs0 PRI Jan 10 06:25:20.476
3 89 *0 4 0 64|0 0 282G 9.14G 19 0|0 0|0 55.3k 1.59m 483 rs0 PRI Jan 10 06:25:21.476
*0 81 *0 *0 0 62|0 0 282G 9.17G 0 0|0 0|0 46.9k 265k 481 rs0 PRI Jan 10 06:25:22.476
6 105 3 *0 0 67|0 0 282G 9.15G 49 0|0 0|0 61.8k 240k 481 rs0 PRI Jan 10 06:25:23.475
And under load we have high page faults:
insert query update delete getmore command flushes mapped vsize res faults qrw arw net_in net_out conn set repl time
1 96 *0 *0 0 70|0 0 282G 10.9G 597 0|0 2|0 59.1k 1.06m 484 rs0 PRI Jan 10 07:48:45.061
1 103 3 *0 0 79|0 0 282G 10.9G 715 0|0 0|0 66.4k 176k 484 rs0 PRI Jan 10 07:48:46.060
2 102 *0 *0 0 72|0 0 282G 10.9G 718 0|0 2|0 59.6k 154k 484 rs0 PRI Jan 10 07:48:47.060
2 104 2 1 0 83|0 0 282G 11.0G 692 0|0 3|0 68.7k 154k 483 rs0 PRI Jan 10 07:48:48.061
2 103 2 *0 0 86|0 0 282G 11.0G 743 0|0 1|0 64.2k 154k 482 rs0 PRI Jan 10 07:48:49.059
*0 116 2 *0 0 95|0 0 282G 11.0G 715 0|0 2|0 68.5k 237k 484 rs0 PRI Jan 10 07:48:50.060
1 112 6 1 0 78|0 0 282G 11.0G 595 0|0 2|0 74.3k 231k 481 rs0 PRI Jan 10 07:48:51.061
2 137 5 1 0 83|0 0 282G 11.0G 733 0|0 3|0 81.6k 301k 480 rs0 PRI Jan 10 07:48:52.060
3 157 5 1 0 92|0 0 282G 11.0G 697 0|0 3|0 89.5k 183k 481 rs0 PRI Jan 10 07:48:53.059
4 102 3 *0 0 73|0 0 282G 10.9G 599 0|0 1|0 67.1k 154k 480 rs0 PRI Jan 10 07:48:54.060
We have about 15GB of free memory that MongoDB
does not use!
On peak we reach 400 OPS, and 500 connections per seconds. Is there anything I could do to improve the performance?
EDIT
iostat
output of disk:
Device: rrqm/s wrqm/s r/s w/s rkB/s wkB/s avgrq-sz avgqu-sz await r_await w_await svctm %util
sda 0.00 87.00 7.00 392.00 460.00 2544.00 15.06 0.08 0.22 8.00 0.08 0.12 4.80
dm-0 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
dm-1 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
dm-2 0.00 0.00 7.00 478.00 460.00 2536.00 12.35 0.09 0.19 8.00 0.08 0.10 4.80
Device: rrqm/s wrqm/s r/s w/s rkB/s wkB/s avgrq-sz avgqu-sz await r_await w_await svctm %util
sda 0.00 0.00 84.00 6.00 3620.00 56.00 81.69 0.82 8.84 9.48 0.00 4.13 37.20
dm-0 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
dm-1 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
dm-2 0.00 0.00 85.00 6.00 3692.00 56.00 82.37 0.82 8.75 9.36 0.00 4.09 37.20
Device: rrqm/s wrqm/s r/s w/s rkB/s wkB/s avgrq-sz avgqu-sz await r_await w_await svctm %util
sda 0.00 0.00 57.00 3.00 2804.00 52.00 95.20 0.44 7.73 7.79 6.67 4.80 28.80
dm-0 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
dm-1 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
dm-2 0.00 0.00 56.00 3.00 2732.00 52.00 94.37 0.44 7.80 7.86 6.67 4.88 28.80
Another snapshot:
Device: rrqm/s wrqm/s r/s w/s rkB/s wkB/s avgrq-sz avgqu-sz await r_await w_await svctm %util
sda 0.00 11.00 267.67 5.00 20813.33 97.33 153.38 1.28 4.68 4.70 3.73 3.56 97.07
dm-0 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
dm-1 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
dm-2 0.00 0.00 267.67 16.00 20806.67 97.33 147.38 1.28 4.50 4.70 1.17 3.42 97.07
Device: rrqm/s wrqm/s r/s w/s rkB/s wkB/s avgrq-sz avgqu-sz await r_await w_await svctm %util
sda 0.00 21.00 221.67 3.00 19089.33 110.67 170.92 1.18 5.26 5.16 12.44 3.95 88.80
dm-0 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
dm-1 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
dm-2 0.00 0.00 221.33 24.00 19061.33 110.67 156.29 1.18 4.82 5.17 1.56 3.62 88.80
EDIT 2:
The output of iostat 1
in order to remove extra information based on read and write:
https://pastebin.com/32WuRSPF
Another result which tps
reached around 1000:
https://pastebin.com/mrQdKZZF
index query-performance mongodb
bumped to the homepage by Community♦ 12 mins ago
This question has answers that may be good or bad; the system has marked it active so that they can be reviewed.
1
Looks like that everything slows down when mongod goes to the disk. How many IOPS your disks can handle? Especially read IOPS and latency.
– JJussi
Jan 12 '18 at 5:43
@JJussi Please see the edit part of the question, I have addediostat
output.
– ALH
Jan 13 '18 at 8:59
@JJussi thank you for your time, how can I see that latency? is r_await, what we're looking for?
– ALH
Jan 13 '18 at 10:25
?_await is average wait time in ms. You could run iostat without x-parameter and watch tps value to see iops. thattommyhall.com/2011/02/18/iops-linux-iostat Strange thing with mongodb is that latency plays a big role and it is hard to measure especially random read latency. Also "how storage is build" plays big role. F.ex. LVM on top of RAID is fast. RAID on top of LVM is slow, at least when both are software base.
– JJussi
Jan 13 '18 at 12:51
1
No, disk is not bottle neck for direct reads. Have to raise my hands... However, that memory consumption is still little bit mystery. Mongod should use all free memory.
– JJussi
Jan 15 '18 at 4:13
|
show 7 more comments
I have DB with about 30M records in a collection with about 100GB collection size (total documents and indexes).
I have a compound index that filters data based on user_id and some other fields like: is_active
, is_logged_in
, etc.
With MongoDB Compass
I see slow queries of about 10s, 20s or even 40 seconds! I ran the exact same query and result is fetched less than 500ms (though it may get cached on second try).
When I get that ongoing op
stat, I see the following lock status:
"lockStats": {
"Global": {
"acquireCount": {
"r": 574
}
},
"MMAPV1Journal": {
"acquireCount": {
"r": 295
},
"acquireWaitCount": {
"r": 2
},
"timeAcquiringMicros": {
"r": 15494
}
},
}
acquireCount
: Number of times the operation acquired the lock in the specified mode is so high compared to the fast query (on another collection) that has the below status:
"lockStats": {
"Global": {
"acquireCount": {
"r": 2
}
},
"MMAPV1Journal": {
"acquireCount": {
"r": 1
}
},
"Database": {
"acquireCount": {
"r": 1
}
},
"Collection": {
"acquireCount": {
"R": 1
}
}
}
When operations are slow, and a user that has many records take so long it has domino effect on all the other operations after a few seconds.
When I explain the query on the large collection I can see the result that it has used an index:
{
"queryPlanner" : {
"plannerVersion" : 1,
"namespace" : "cuda.call_history",
"indexFilterSet" : false,
"parsedQuery" : {
"$and" : [
{
"$or" : [
{
"trk.0.extra_data.spam.is_spam" : {
"$eq" : false
}
},
{
"$nor" : [
{
"trk.0.extra_data.spam.is_spam" : {
"$exists" : true
}
}
]
}
]
},
{
"is_removed" : {
"$eq" : false
}
},
{
"user_id" : {
"$eq" : "00000000000040008000000000002a5d"
}
},
{
"trk.0.direction" : {
"$eq" : "ingress"
}
},
{
"trk.0.type" : {
"$eq" : "fax"
}
},
{
"date" : {
"$lt" : "2018-01-09 10:36:31"
}
},
{
"date" : {
"$gt" : "1970-01-01 00:00:00"
}
},
{
"trk.0.data.status" : {
"$in" : [
"p_received",
"success"
]
}
}
]
},
"winningPlan" : {
"stage" : "FETCH",
"filter" : {
"$and" : [
{
"$or" : [
{
"trk.0.extra_data.spam.is_spam" : {
"$eq" : false
}
},
{
"$nor" : [
{
"trk.0.extra_data.spam.is_spam" : {
"$exists" : true
}
}
]
}
]
},
{
"trk.0.type" : {
"$eq" : "fax"
}
},
{
"date" : {
"$lt" : "2018-01-09 10:36:31"
}
},
{
"date" : {
"$gt" : "1970-01-01 00:00:00"
}
}
]
},
"inputStage" : {
"stage" : "IXSCAN",
"keyPattern" : {
"user_id" : 1,
"trk.0.direction" : 1,
"is_read" : 1,
"trk.0.data.status" : 1,
"is_removed" : 1
},
"indexName" : "user_id_direction_is_read_status_is_removed",
"isMultiKey" : false,
"isUnique" : false,
"isSparse" : false,
"isPartial" : false,
"indexVersion" : 1,
"direction" : "forward",
"indexBounds" : {
"user_id" : [
"["00000000000040008000000000002a5d", "00000000000040008000000000002a5d"]"
],
"trk.0.direction" : [
"["ingress", "ingress"]"
],
"is_read" : [
"[MinKey, MaxKey]"
],
"trk.0.data.status" : [
"["p_received", "p_received"]",
"["success", "success"]"
],
"is_removed" : [
"[false, false]"
]
}
}
},
"rejectedPlans" : [
{
"stage" : "FETCH",
"filter" : {
"$and" : [
{
"$or" : [
{
"trk.0.extra_data.spam.is_spam" : {
"$eq" : false
}
},
{
"$nor" : [
{
"trk.0.extra_data.spam.is_spam" : {
"$exists" : true
}
}
]
}
]
},
{
"is_removed" : {
"$eq" : false
}
},
{
"trk.0.direction" : {
"$eq" : "ingress"
}
},
{
"trk.0.type" : {
"$eq" : "fax"
}
},
{
"trk.0.data.status" : {
"$in" : [
"p_received",
"success"
]
}
}
]
},
"inputStage" : {
"stage" : "IXSCAN",
"keyPattern" : {
"user_id" : 1,
"date" : -1
},
"indexName" : "user_id_date",
"isMultiKey" : false,
"isUnique" : false,
"isSparse" : false,
"isPartial" : false,
"indexVersion" : 1,
"direction" : "forward",
"indexBounds" : {
"user_id" : [
"["00000000000040008000000000002a5d", "00000000000040008000000000002a5d"]"
],
"date" : [
"("2018-01-09 10:36:31", "1970-01-01 00:00:00")"
]
}
}
},
{
"stage" : "FETCH",
"filter" : {
"$and" : [
{
"$or" : [
{
"trk.0.extra_data.spam.is_spam" : {
"$eq" : false
}
},
{
"$nor" : [
{
"trk.0.extra_data.spam.is_spam" : {
"$exists" : true
}
}
]
}
]
},
{
"is_removed" : {
"$eq" : false
}
},
{
"trk.0.direction" : {
"$eq" : "ingress"
}
},
{
"trk.0.type" : {
"$eq" : "fax"
}
},
{
"date" : {
"$lt" : "2018-01-09 10:36:31"
}
},
{
"date" : {
"$gt" : "1970-01-01 00:00:00"
}
},
{
"trk.0.data.status" : {
"$in" : [
"p_received",
"success"
]
}
}
]
},
"inputStage" : {
"stage" : "IXSCAN",
"keyPattern" : {
"user_id" : 1,
"to" : 1,
"from" : 1
},
"indexName" : "user_id_to_from",
"isMultiKey" : false,
"isUnique" : false,
"isSparse" : false,
"isPartial" : false,
"indexVersion" : 1,
"direction" : "forward",
"indexBounds" : {
"user_id" : [
"["00000000000040008000000000002a5d", "00000000000040008000000000002a5d"]"
],
"to" : [
"[MinKey, MaxKey]"
],
"from" : [
"[MinKey, MaxKey]"
]
}
}
}
]
},
"executionStats" : {
"executionSuccess" : true,
"nReturned" : 4682,
"executionTimeMillis" : 2072,
"totalKeysExamined" : 4688,
"totalDocsExamined" : 4682,
"executionStages" : {
"stage" : "FETCH",
"filter" : {
"$and" : [
{
"$or" : [
{
"trk.0.extra_data.spam.is_spam" : {
"$eq" : false
}
},
{
"$nor" : [
{
"trk.0.extra_data.spam.is_spam" : {
"$exists" : true
}
}
]
}
]
},
{
"trk.0.type" : {
"$eq" : "fax"
}
},
{
"date" : {
"$lt" : "2018-01-09 10:36:31"
}
},
{
"date" : {
"$gt" : "1970-01-01 00:00:00"
}
}
]
},
"nReturned" : 4682,
"executionTimeMillisEstimate" : 710,
"works" : 4897,
"advanced" : 4682,
"needTime" : 5,
"needYield" : 209,
"saveState" : 234,
"restoreState" : 234,
"isEOF" : 1,
"invalidates" : 1,
"docsExamined" : 4682,
"alreadyHasObj" : 0,
"inputStage" : {
"stage" : "IXSCAN",
"nReturned" : 4682,
"executionTimeMillisEstimate" : 305,
"works" : 4688,
"advanced" : 4682,
"needTime" : 5,
"needYield" : 0,
"saveState" : 234,
"restoreState" : 234,
"isEOF" : 1,
"invalidates" : 1,
"keyPattern" : {
"user_id" : 1,
"trk.0.direction" : 1,
"is_read" : 1,
"trk.0.data.status" : 1,
"is_removed" : 1
},
"indexName" : "user_id_direction_is_read_status_is_removed",
"isMultiKey" : false,
"isUnique" : false,
"isSparse" : false,
"isPartial" : false,
"indexVersion" : 1,
"direction" : "forward",
"indexBounds" : {
"user_id" : [
"["00000000000040008000000000002a5d", "00000000000040008000000000002a5d"]"
],
"trk.0.direction" : [
"["ingress", "ingress"]"
],
"is_read" : [
"[MinKey, MaxKey]"
],
"trk.0.data.status" : [
"["p_received", "p_received"]",
"["success", "success"]"
],
"is_removed" : [
"[false, false]"
]
},
"keysExamined" : 4688,
"seeks" : 6,
"dupsTested" : 0,
"dupsDropped" : 0,
"seenInvalidated" : 0
}
}
},
"serverInfo" : {
"host" : “hs1.mydomain.com”,
"port" : 27017,
"version" : "3.4.10",
"gitVersion" : "078f28920cb24de0dd479b5ea6c66c644f6326e9"
},
"ok" : 1.0
}
keysExamined
is just 4,688! It is not much compared to the total data of the collection which is 30M documents. When Mongo gets slow when it has domino effect, CPU usage and Memory is not high. Mongo just uses 40% of the memory. Disk partition is Ext4
if that helps.
Another example of a very slow query in full details:
{
"desc": "conn199276",
"threadId": "140070259820288",
"connectionId": 199276,
"client": "client_server_ip:45590",
"active": "true",
"opid": 63869351,
"secs_running": 36,
"microsecs_running": 36136211,
"op": "query",
"ns": "cuda.call_history",
"query": {
"find": "call_history",
"filter": {
"is_removed": false,
"trk.0.extra_data.spam.is_spam": true,
"trk.0.direction": "ingress",
"date": {
"$gt": "1970-01-01 00:00:00",
"$lt": "4001-01-01 00:00:00"
},
"trk.0.extra_data.status": {
"$in": [
"success",
"p_received"
]
},
"trk.0.type": "clk",
"owner_id": "00000000000040008000000000003828"
},
"sort": {
"date": -1
},
"limit": 31
},
"numYields": 6600,
"locks": {},
"waitingForLock": "false",
"lockStats": {
"Global": {
"acquireCount": {
"r": 13200
}
},
"MMAPV1Journal": {
"acquireCount": {
"r": 6611
},
"acquireWaitCount": {
"r": 9
},
"timeAcquiringMicros": {
"r": 50854
}
},
"Database": {
"acquireCount": {
"r": 6600
}
},
"Collection": {
"acquireCount": {
"R": 6600
},
"acquireWaitCount": {
"R": 11
},
"timeAcquiringMicros": {
"R": 163707
}
}
}
}
Output of db.stats()
:
rs0:PRIMARY> db.stats()
{
"db" : "cuda",
"collections" : 5,
"views" : 0,
"objects" : 55009248,
"avgObjSize" : 2018.6135346551184,
"dataSize" : 111042412544,
"storageSize" : 113055362336,
"numExtents" : 100,
"indexes" : 7,
"indexSize" : 14223460160,
"fileSize" : 133012914176,
"nsSizeMB" : 16,
"extentFreeList" : {
"num" : 0,
"totalSize" : 0
},
"dataFileVersion" : {
"major" : 4,
"minor" : 22
},
"ok" : 1
}
mongostat
displays the below result, I think fault number is high:
insert query update delete getmore command flushes mapped vsize res faults qrw arw net_in net_out conn set repl time
5 93 4 *0 0 64|0 0 282G 9.11G 26 0|0 0|0 64.3k 187k 481 rs0 PRI Jan 10 06:25:14.476
*0 107 *0 1 0 58|0 0 282G 9.14G 4 0|0 0|0 51.5k 247k 481 rs0 PRI Jan 10 06:25:15.475
2 88 5 *0 0 70|0 0 282G 9.04G 26 0|0 0|0 61.5k 245k 481 rs0 PRI Jan 10 06:25:16.476
3 98 2 *0 0 71|0 0 282G 9.12G 6 0|0 0|0 59.6k 274k 481 rs0 PRI Jan 10 06:25:17.474
1 105 *0 1 0 82|0 0 282G 9.10G 14 0|0 0|0 63.4k 254k 481 rs0 PRI Jan 10 06:25:18.476
1 88 2 *0 0 68|0 0 282G 9.12G 75 0|0 0|0 49.9k 194k 481 rs0 PRI Jan 10 06:25:19.476
5 120 4 *0 0 78|0 0 282G 9.11G 5 0|0 0|0 75.7k 548k 484 rs0 PRI Jan 10 06:25:20.476
3 89 *0 4 0 64|0 0 282G 9.14G 19 0|0 0|0 55.3k 1.59m 483 rs0 PRI Jan 10 06:25:21.476
*0 81 *0 *0 0 62|0 0 282G 9.17G 0 0|0 0|0 46.9k 265k 481 rs0 PRI Jan 10 06:25:22.476
6 105 3 *0 0 67|0 0 282G 9.15G 49 0|0 0|0 61.8k 240k 481 rs0 PRI Jan 10 06:25:23.475
And under load we have high page faults:
insert query update delete getmore command flushes mapped vsize res faults qrw arw net_in net_out conn set repl time
1 96 *0 *0 0 70|0 0 282G 10.9G 597 0|0 2|0 59.1k 1.06m 484 rs0 PRI Jan 10 07:48:45.061
1 103 3 *0 0 79|0 0 282G 10.9G 715 0|0 0|0 66.4k 176k 484 rs0 PRI Jan 10 07:48:46.060
2 102 *0 *0 0 72|0 0 282G 10.9G 718 0|0 2|0 59.6k 154k 484 rs0 PRI Jan 10 07:48:47.060
2 104 2 1 0 83|0 0 282G 11.0G 692 0|0 3|0 68.7k 154k 483 rs0 PRI Jan 10 07:48:48.061
2 103 2 *0 0 86|0 0 282G 11.0G 743 0|0 1|0 64.2k 154k 482 rs0 PRI Jan 10 07:48:49.059
*0 116 2 *0 0 95|0 0 282G 11.0G 715 0|0 2|0 68.5k 237k 484 rs0 PRI Jan 10 07:48:50.060
1 112 6 1 0 78|0 0 282G 11.0G 595 0|0 2|0 74.3k 231k 481 rs0 PRI Jan 10 07:48:51.061
2 137 5 1 0 83|0 0 282G 11.0G 733 0|0 3|0 81.6k 301k 480 rs0 PRI Jan 10 07:48:52.060
3 157 5 1 0 92|0 0 282G 11.0G 697 0|0 3|0 89.5k 183k 481 rs0 PRI Jan 10 07:48:53.059
4 102 3 *0 0 73|0 0 282G 10.9G 599 0|0 1|0 67.1k 154k 480 rs0 PRI Jan 10 07:48:54.060
We have about 15GB of free memory that MongoDB
does not use!
On peak we reach 400 OPS, and 500 connections per seconds. Is there anything I could do to improve the performance?
EDIT
iostat
output of disk:
Device: rrqm/s wrqm/s r/s w/s rkB/s wkB/s avgrq-sz avgqu-sz await r_await w_await svctm %util
sda 0.00 87.00 7.00 392.00 460.00 2544.00 15.06 0.08 0.22 8.00 0.08 0.12 4.80
dm-0 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
dm-1 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
dm-2 0.00 0.00 7.00 478.00 460.00 2536.00 12.35 0.09 0.19 8.00 0.08 0.10 4.80
Device: rrqm/s wrqm/s r/s w/s rkB/s wkB/s avgrq-sz avgqu-sz await r_await w_await svctm %util
sda 0.00 0.00 84.00 6.00 3620.00 56.00 81.69 0.82 8.84 9.48 0.00 4.13 37.20
dm-0 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
dm-1 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
dm-2 0.00 0.00 85.00 6.00 3692.00 56.00 82.37 0.82 8.75 9.36 0.00 4.09 37.20
Device: rrqm/s wrqm/s r/s w/s rkB/s wkB/s avgrq-sz avgqu-sz await r_await w_await svctm %util
sda 0.00 0.00 57.00 3.00 2804.00 52.00 95.20 0.44 7.73 7.79 6.67 4.80 28.80
dm-0 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
dm-1 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
dm-2 0.00 0.00 56.00 3.00 2732.00 52.00 94.37 0.44 7.80 7.86 6.67 4.88 28.80
Another snapshot:
Device: rrqm/s wrqm/s r/s w/s rkB/s wkB/s avgrq-sz avgqu-sz await r_await w_await svctm %util
sda 0.00 11.00 267.67 5.00 20813.33 97.33 153.38 1.28 4.68 4.70 3.73 3.56 97.07
dm-0 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
dm-1 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
dm-2 0.00 0.00 267.67 16.00 20806.67 97.33 147.38 1.28 4.50 4.70 1.17 3.42 97.07
Device: rrqm/s wrqm/s r/s w/s rkB/s wkB/s avgrq-sz avgqu-sz await r_await w_await svctm %util
sda 0.00 21.00 221.67 3.00 19089.33 110.67 170.92 1.18 5.26 5.16 12.44 3.95 88.80
dm-0 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
dm-1 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
dm-2 0.00 0.00 221.33 24.00 19061.33 110.67 156.29 1.18 4.82 5.17 1.56 3.62 88.80
EDIT 2:
The output of iostat 1
in order to remove extra information based on read and write:
https://pastebin.com/32WuRSPF
Another result which tps
reached around 1000:
https://pastebin.com/mrQdKZZF
index query-performance mongodb
I have DB with about 30M records in a collection with about 100GB collection size (total documents and indexes).
I have a compound index that filters data based on user_id and some other fields like: is_active
, is_logged_in
, etc.
With MongoDB Compass
I see slow queries of about 10s, 20s or even 40 seconds! I ran the exact same query and result is fetched less than 500ms (though it may get cached on second try).
When I get that ongoing op
stat, I see the following lock status:
"lockStats": {
"Global": {
"acquireCount": {
"r": 574
}
},
"MMAPV1Journal": {
"acquireCount": {
"r": 295
},
"acquireWaitCount": {
"r": 2
},
"timeAcquiringMicros": {
"r": 15494
}
},
}
acquireCount
: Number of times the operation acquired the lock in the specified mode is so high compared to the fast query (on another collection) that has the below status:
"lockStats": {
"Global": {
"acquireCount": {
"r": 2
}
},
"MMAPV1Journal": {
"acquireCount": {
"r": 1
}
},
"Database": {
"acquireCount": {
"r": 1
}
},
"Collection": {
"acquireCount": {
"R": 1
}
}
}
When operations are slow, and a user that has many records take so long it has domino effect on all the other operations after a few seconds.
When I explain the query on the large collection I can see the result that it has used an index:
{
"queryPlanner" : {
"plannerVersion" : 1,
"namespace" : "cuda.call_history",
"indexFilterSet" : false,
"parsedQuery" : {
"$and" : [
{
"$or" : [
{
"trk.0.extra_data.spam.is_spam" : {
"$eq" : false
}
},
{
"$nor" : [
{
"trk.0.extra_data.spam.is_spam" : {
"$exists" : true
}
}
]
}
]
},
{
"is_removed" : {
"$eq" : false
}
},
{
"user_id" : {
"$eq" : "00000000000040008000000000002a5d"
}
},
{
"trk.0.direction" : {
"$eq" : "ingress"
}
},
{
"trk.0.type" : {
"$eq" : "fax"
}
},
{
"date" : {
"$lt" : "2018-01-09 10:36:31"
}
},
{
"date" : {
"$gt" : "1970-01-01 00:00:00"
}
},
{
"trk.0.data.status" : {
"$in" : [
"p_received",
"success"
]
}
}
]
},
"winningPlan" : {
"stage" : "FETCH",
"filter" : {
"$and" : [
{
"$or" : [
{
"trk.0.extra_data.spam.is_spam" : {
"$eq" : false
}
},
{
"$nor" : [
{
"trk.0.extra_data.spam.is_spam" : {
"$exists" : true
}
}
]
}
]
},
{
"trk.0.type" : {
"$eq" : "fax"
}
},
{
"date" : {
"$lt" : "2018-01-09 10:36:31"
}
},
{
"date" : {
"$gt" : "1970-01-01 00:00:00"
}
}
]
},
"inputStage" : {
"stage" : "IXSCAN",
"keyPattern" : {
"user_id" : 1,
"trk.0.direction" : 1,
"is_read" : 1,
"trk.0.data.status" : 1,
"is_removed" : 1
},
"indexName" : "user_id_direction_is_read_status_is_removed",
"isMultiKey" : false,
"isUnique" : false,
"isSparse" : false,
"isPartial" : false,
"indexVersion" : 1,
"direction" : "forward",
"indexBounds" : {
"user_id" : [
"["00000000000040008000000000002a5d", "00000000000040008000000000002a5d"]"
],
"trk.0.direction" : [
"["ingress", "ingress"]"
],
"is_read" : [
"[MinKey, MaxKey]"
],
"trk.0.data.status" : [
"["p_received", "p_received"]",
"["success", "success"]"
],
"is_removed" : [
"[false, false]"
]
}
}
},
"rejectedPlans" : [
{
"stage" : "FETCH",
"filter" : {
"$and" : [
{
"$or" : [
{
"trk.0.extra_data.spam.is_spam" : {
"$eq" : false
}
},
{
"$nor" : [
{
"trk.0.extra_data.spam.is_spam" : {
"$exists" : true
}
}
]
}
]
},
{
"is_removed" : {
"$eq" : false
}
},
{
"trk.0.direction" : {
"$eq" : "ingress"
}
},
{
"trk.0.type" : {
"$eq" : "fax"
}
},
{
"trk.0.data.status" : {
"$in" : [
"p_received",
"success"
]
}
}
]
},
"inputStage" : {
"stage" : "IXSCAN",
"keyPattern" : {
"user_id" : 1,
"date" : -1
},
"indexName" : "user_id_date",
"isMultiKey" : false,
"isUnique" : false,
"isSparse" : false,
"isPartial" : false,
"indexVersion" : 1,
"direction" : "forward",
"indexBounds" : {
"user_id" : [
"["00000000000040008000000000002a5d", "00000000000040008000000000002a5d"]"
],
"date" : [
"("2018-01-09 10:36:31", "1970-01-01 00:00:00")"
]
}
}
},
{
"stage" : "FETCH",
"filter" : {
"$and" : [
{
"$or" : [
{
"trk.0.extra_data.spam.is_spam" : {
"$eq" : false
}
},
{
"$nor" : [
{
"trk.0.extra_data.spam.is_spam" : {
"$exists" : true
}
}
]
}
]
},
{
"is_removed" : {
"$eq" : false
}
},
{
"trk.0.direction" : {
"$eq" : "ingress"
}
},
{
"trk.0.type" : {
"$eq" : "fax"
}
},
{
"date" : {
"$lt" : "2018-01-09 10:36:31"
}
},
{
"date" : {
"$gt" : "1970-01-01 00:00:00"
}
},
{
"trk.0.data.status" : {
"$in" : [
"p_received",
"success"
]
}
}
]
},
"inputStage" : {
"stage" : "IXSCAN",
"keyPattern" : {
"user_id" : 1,
"to" : 1,
"from" : 1
},
"indexName" : "user_id_to_from",
"isMultiKey" : false,
"isUnique" : false,
"isSparse" : false,
"isPartial" : false,
"indexVersion" : 1,
"direction" : "forward",
"indexBounds" : {
"user_id" : [
"["00000000000040008000000000002a5d", "00000000000040008000000000002a5d"]"
],
"to" : [
"[MinKey, MaxKey]"
],
"from" : [
"[MinKey, MaxKey]"
]
}
}
}
]
},
"executionStats" : {
"executionSuccess" : true,
"nReturned" : 4682,
"executionTimeMillis" : 2072,
"totalKeysExamined" : 4688,
"totalDocsExamined" : 4682,
"executionStages" : {
"stage" : "FETCH",
"filter" : {
"$and" : [
{
"$or" : [
{
"trk.0.extra_data.spam.is_spam" : {
"$eq" : false
}
},
{
"$nor" : [
{
"trk.0.extra_data.spam.is_spam" : {
"$exists" : true
}
}
]
}
]
},
{
"trk.0.type" : {
"$eq" : "fax"
}
},
{
"date" : {
"$lt" : "2018-01-09 10:36:31"
}
},
{
"date" : {
"$gt" : "1970-01-01 00:00:00"
}
}
]
},
"nReturned" : 4682,
"executionTimeMillisEstimate" : 710,
"works" : 4897,
"advanced" : 4682,
"needTime" : 5,
"needYield" : 209,
"saveState" : 234,
"restoreState" : 234,
"isEOF" : 1,
"invalidates" : 1,
"docsExamined" : 4682,
"alreadyHasObj" : 0,
"inputStage" : {
"stage" : "IXSCAN",
"nReturned" : 4682,
"executionTimeMillisEstimate" : 305,
"works" : 4688,
"advanced" : 4682,
"needTime" : 5,
"needYield" : 0,
"saveState" : 234,
"restoreState" : 234,
"isEOF" : 1,
"invalidates" : 1,
"keyPattern" : {
"user_id" : 1,
"trk.0.direction" : 1,
"is_read" : 1,
"trk.0.data.status" : 1,
"is_removed" : 1
},
"indexName" : "user_id_direction_is_read_status_is_removed",
"isMultiKey" : false,
"isUnique" : false,
"isSparse" : false,
"isPartial" : false,
"indexVersion" : 1,
"direction" : "forward",
"indexBounds" : {
"user_id" : [
"["00000000000040008000000000002a5d", "00000000000040008000000000002a5d"]"
],
"trk.0.direction" : [
"["ingress", "ingress"]"
],
"is_read" : [
"[MinKey, MaxKey]"
],
"trk.0.data.status" : [
"["p_received", "p_received"]",
"["success", "success"]"
],
"is_removed" : [
"[false, false]"
]
},
"keysExamined" : 4688,
"seeks" : 6,
"dupsTested" : 0,
"dupsDropped" : 0,
"seenInvalidated" : 0
}
}
},
"serverInfo" : {
"host" : “hs1.mydomain.com”,
"port" : 27017,
"version" : "3.4.10",
"gitVersion" : "078f28920cb24de0dd479b5ea6c66c644f6326e9"
},
"ok" : 1.0
}
keysExamined
is just 4,688! It is not much compared to the total data of the collection which is 30M documents. When Mongo gets slow when it has domino effect, CPU usage and Memory is not high. Mongo just uses 40% of the memory. Disk partition is Ext4
if that helps.
Another example of a very slow query in full details:
{
"desc": "conn199276",
"threadId": "140070259820288",
"connectionId": 199276,
"client": "client_server_ip:45590",
"active": "true",
"opid": 63869351,
"secs_running": 36,
"microsecs_running": 36136211,
"op": "query",
"ns": "cuda.call_history",
"query": {
"find": "call_history",
"filter": {
"is_removed": false,
"trk.0.extra_data.spam.is_spam": true,
"trk.0.direction": "ingress",
"date": {
"$gt": "1970-01-01 00:00:00",
"$lt": "4001-01-01 00:00:00"
},
"trk.0.extra_data.status": {
"$in": [
"success",
"p_received"
]
},
"trk.0.type": "clk",
"owner_id": "00000000000040008000000000003828"
},
"sort": {
"date": -1
},
"limit": 31
},
"numYields": 6600,
"locks": {},
"waitingForLock": "false",
"lockStats": {
"Global": {
"acquireCount": {
"r": 13200
}
},
"MMAPV1Journal": {
"acquireCount": {
"r": 6611
},
"acquireWaitCount": {
"r": 9
},
"timeAcquiringMicros": {
"r": 50854
}
},
"Database": {
"acquireCount": {
"r": 6600
}
},
"Collection": {
"acquireCount": {
"R": 6600
},
"acquireWaitCount": {
"R": 11
},
"timeAcquiringMicros": {
"R": 163707
}
}
}
}
Output of db.stats()
:
rs0:PRIMARY> db.stats()
{
"db" : "cuda",
"collections" : 5,
"views" : 0,
"objects" : 55009248,
"avgObjSize" : 2018.6135346551184,
"dataSize" : 111042412544,
"storageSize" : 113055362336,
"numExtents" : 100,
"indexes" : 7,
"indexSize" : 14223460160,
"fileSize" : 133012914176,
"nsSizeMB" : 16,
"extentFreeList" : {
"num" : 0,
"totalSize" : 0
},
"dataFileVersion" : {
"major" : 4,
"minor" : 22
},
"ok" : 1
}
mongostat
displays the below result, I think fault number is high:
insert query update delete getmore command flushes mapped vsize res faults qrw arw net_in net_out conn set repl time
5 93 4 *0 0 64|0 0 282G 9.11G 26 0|0 0|0 64.3k 187k 481 rs0 PRI Jan 10 06:25:14.476
*0 107 *0 1 0 58|0 0 282G 9.14G 4 0|0 0|0 51.5k 247k 481 rs0 PRI Jan 10 06:25:15.475
2 88 5 *0 0 70|0 0 282G 9.04G 26 0|0 0|0 61.5k 245k 481 rs0 PRI Jan 10 06:25:16.476
3 98 2 *0 0 71|0 0 282G 9.12G 6 0|0 0|0 59.6k 274k 481 rs0 PRI Jan 10 06:25:17.474
1 105 *0 1 0 82|0 0 282G 9.10G 14 0|0 0|0 63.4k 254k 481 rs0 PRI Jan 10 06:25:18.476
1 88 2 *0 0 68|0 0 282G 9.12G 75 0|0 0|0 49.9k 194k 481 rs0 PRI Jan 10 06:25:19.476
5 120 4 *0 0 78|0 0 282G 9.11G 5 0|0 0|0 75.7k 548k 484 rs0 PRI Jan 10 06:25:20.476
3 89 *0 4 0 64|0 0 282G 9.14G 19 0|0 0|0 55.3k 1.59m 483 rs0 PRI Jan 10 06:25:21.476
*0 81 *0 *0 0 62|0 0 282G 9.17G 0 0|0 0|0 46.9k 265k 481 rs0 PRI Jan 10 06:25:22.476
6 105 3 *0 0 67|0 0 282G 9.15G 49 0|0 0|0 61.8k 240k 481 rs0 PRI Jan 10 06:25:23.475
And under load we have high page faults:
insert query update delete getmore command flushes mapped vsize res faults qrw arw net_in net_out conn set repl time
1 96 *0 *0 0 70|0 0 282G 10.9G 597 0|0 2|0 59.1k 1.06m 484 rs0 PRI Jan 10 07:48:45.061
1 103 3 *0 0 79|0 0 282G 10.9G 715 0|0 0|0 66.4k 176k 484 rs0 PRI Jan 10 07:48:46.060
2 102 *0 *0 0 72|0 0 282G 10.9G 718 0|0 2|0 59.6k 154k 484 rs0 PRI Jan 10 07:48:47.060
2 104 2 1 0 83|0 0 282G 11.0G 692 0|0 3|0 68.7k 154k 483 rs0 PRI Jan 10 07:48:48.061
2 103 2 *0 0 86|0 0 282G 11.0G 743 0|0 1|0 64.2k 154k 482 rs0 PRI Jan 10 07:48:49.059
*0 116 2 *0 0 95|0 0 282G 11.0G 715 0|0 2|0 68.5k 237k 484 rs0 PRI Jan 10 07:48:50.060
1 112 6 1 0 78|0 0 282G 11.0G 595 0|0 2|0 74.3k 231k 481 rs0 PRI Jan 10 07:48:51.061
2 137 5 1 0 83|0 0 282G 11.0G 733 0|0 3|0 81.6k 301k 480 rs0 PRI Jan 10 07:48:52.060
3 157 5 1 0 92|0 0 282G 11.0G 697 0|0 3|0 89.5k 183k 481 rs0 PRI Jan 10 07:48:53.059
4 102 3 *0 0 73|0 0 282G 10.9G 599 0|0 1|0 67.1k 154k 480 rs0 PRI Jan 10 07:48:54.060
We have about 15GB of free memory that MongoDB
does not use!
On peak we reach 400 OPS, and 500 connections per seconds. Is there anything I could do to improve the performance?
EDIT
iostat
output of disk:
Device: rrqm/s wrqm/s r/s w/s rkB/s wkB/s avgrq-sz avgqu-sz await r_await w_await svctm %util
sda 0.00 87.00 7.00 392.00 460.00 2544.00 15.06 0.08 0.22 8.00 0.08 0.12 4.80
dm-0 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
dm-1 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
dm-2 0.00 0.00 7.00 478.00 460.00 2536.00 12.35 0.09 0.19 8.00 0.08 0.10 4.80
Device: rrqm/s wrqm/s r/s w/s rkB/s wkB/s avgrq-sz avgqu-sz await r_await w_await svctm %util
sda 0.00 0.00 84.00 6.00 3620.00 56.00 81.69 0.82 8.84 9.48 0.00 4.13 37.20
dm-0 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
dm-1 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
dm-2 0.00 0.00 85.00 6.00 3692.00 56.00 82.37 0.82 8.75 9.36 0.00 4.09 37.20
Device: rrqm/s wrqm/s r/s w/s rkB/s wkB/s avgrq-sz avgqu-sz await r_await w_await svctm %util
sda 0.00 0.00 57.00 3.00 2804.00 52.00 95.20 0.44 7.73 7.79 6.67 4.80 28.80
dm-0 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
dm-1 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
dm-2 0.00 0.00 56.00 3.00 2732.00 52.00 94.37 0.44 7.80 7.86 6.67 4.88 28.80
Another snapshot:
Device: rrqm/s wrqm/s r/s w/s rkB/s wkB/s avgrq-sz avgqu-sz await r_await w_await svctm %util
sda 0.00 11.00 267.67 5.00 20813.33 97.33 153.38 1.28 4.68 4.70 3.73 3.56 97.07
dm-0 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
dm-1 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
dm-2 0.00 0.00 267.67 16.00 20806.67 97.33 147.38 1.28 4.50 4.70 1.17 3.42 97.07
Device: rrqm/s wrqm/s r/s w/s rkB/s wkB/s avgrq-sz avgqu-sz await r_await w_await svctm %util
sda 0.00 21.00 221.67 3.00 19089.33 110.67 170.92 1.18 5.26 5.16 12.44 3.95 88.80
dm-0 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
dm-1 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
dm-2 0.00 0.00 221.33 24.00 19061.33 110.67 156.29 1.18 4.82 5.17 1.56 3.62 88.80
EDIT 2:
The output of iostat 1
in order to remove extra information based on read and write:
https://pastebin.com/32WuRSPF
Another result which tps
reached around 1000:
https://pastebin.com/mrQdKZZF
index query-performance mongodb
index query-performance mongodb
edited Jan 13 '18 at 14:08
ALH
asked Jan 10 '18 at 13:25
ALHALH
1,67682737
1,67682737
bumped to the homepage by Community♦ 12 mins ago
This question has answers that may be good or bad; the system has marked it active so that they can be reviewed.
bumped to the homepage by Community♦ 12 mins ago
This question has answers that may be good or bad; the system has marked it active so that they can be reviewed.
1
Looks like that everything slows down when mongod goes to the disk. How many IOPS your disks can handle? Especially read IOPS and latency.
– JJussi
Jan 12 '18 at 5:43
@JJussi Please see the edit part of the question, I have addediostat
output.
– ALH
Jan 13 '18 at 8:59
@JJussi thank you for your time, how can I see that latency? is r_await, what we're looking for?
– ALH
Jan 13 '18 at 10:25
?_await is average wait time in ms. You could run iostat without x-parameter and watch tps value to see iops. thattommyhall.com/2011/02/18/iops-linux-iostat Strange thing with mongodb is that latency plays a big role and it is hard to measure especially random read latency. Also "how storage is build" plays big role. F.ex. LVM on top of RAID is fast. RAID on top of LVM is slow, at least when both are software base.
– JJussi
Jan 13 '18 at 12:51
1
No, disk is not bottle neck for direct reads. Have to raise my hands... However, that memory consumption is still little bit mystery. Mongod should use all free memory.
– JJussi
Jan 15 '18 at 4:13
|
show 7 more comments
1
Looks like that everything slows down when mongod goes to the disk. How many IOPS your disks can handle? Especially read IOPS and latency.
– JJussi
Jan 12 '18 at 5:43
@JJussi Please see the edit part of the question, I have addediostat
output.
– ALH
Jan 13 '18 at 8:59
@JJussi thank you for your time, how can I see that latency? is r_await, what we're looking for?
– ALH
Jan 13 '18 at 10:25
?_await is average wait time in ms. You could run iostat without x-parameter and watch tps value to see iops. thattommyhall.com/2011/02/18/iops-linux-iostat Strange thing with mongodb is that latency plays a big role and it is hard to measure especially random read latency. Also "how storage is build" plays big role. F.ex. LVM on top of RAID is fast. RAID on top of LVM is slow, at least when both are software base.
– JJussi
Jan 13 '18 at 12:51
1
No, disk is not bottle neck for direct reads. Have to raise my hands... However, that memory consumption is still little bit mystery. Mongod should use all free memory.
– JJussi
Jan 15 '18 at 4:13
1
1
Looks like that everything slows down when mongod goes to the disk. How many IOPS your disks can handle? Especially read IOPS and latency.
– JJussi
Jan 12 '18 at 5:43
Looks like that everything slows down when mongod goes to the disk. How many IOPS your disks can handle? Especially read IOPS and latency.
– JJussi
Jan 12 '18 at 5:43
@JJussi Please see the edit part of the question, I have added
iostat
output.– ALH
Jan 13 '18 at 8:59
@JJussi Please see the edit part of the question, I have added
iostat
output.– ALH
Jan 13 '18 at 8:59
@JJussi thank you for your time, how can I see that latency? is r_await, what we're looking for?
– ALH
Jan 13 '18 at 10:25
@JJussi thank you for your time, how can I see that latency? is r_await, what we're looking for?
– ALH
Jan 13 '18 at 10:25
?_await is average wait time in ms. You could run iostat without x-parameter and watch tps value to see iops. thattommyhall.com/2011/02/18/iops-linux-iostat Strange thing with mongodb is that latency plays a big role and it is hard to measure especially random read latency. Also "how storage is build" plays big role. F.ex. LVM on top of RAID is fast. RAID on top of LVM is slow, at least when both are software base.
– JJussi
Jan 13 '18 at 12:51
?_await is average wait time in ms. You could run iostat without x-parameter and watch tps value to see iops. thattommyhall.com/2011/02/18/iops-linux-iostat Strange thing with mongodb is that latency plays a big role and it is hard to measure especially random read latency. Also "how storage is build" plays big role. F.ex. LVM on top of RAID is fast. RAID on top of LVM is slow, at least when both are software base.
– JJussi
Jan 13 '18 at 12:51
1
1
No, disk is not bottle neck for direct reads. Have to raise my hands... However, that memory consumption is still little bit mystery. Mongod should use all free memory.
– JJussi
Jan 15 '18 at 4:13
No, disk is not bottle neck for direct reads. Have to raise my hands... However, that memory consumption is still little bit mystery. Mongod should use all free memory.
– JJussi
Jan 15 '18 at 4:13
|
show 7 more comments
1 Answer
1
active
oldest
votes
High disk readahead could be the reason why physical memory is underutilized. If it's high, for random reads the system might be reading a lot of data MongoDB does not need, thus "polluting" the file system cache. That in return could result in extra page faults.
I suggest making sure your system complies with the Recommended Configuration as per Production Notes.
How can I set readahead inMongoDB
? Is there a way to set this onMongoDB MMAPV1
?
– ALH
Feb 24 '18 at 8:25
RA
in my disk server is set to256
, I checked it withblockdev --report
command. Is that a high number? Should I lower the value forRA
(readahead)?
– ALH
Feb 24 '18 at 8:30
add a comment |
Your Answer
StackExchange.ready(function() {
var channelOptions = {
tags: "".split(" "),
id: "182"
};
initTagRenderer("".split(" "), "".split(" "), channelOptions);
StackExchange.using("externalEditor", function() {
// Have to fire editor after snippets, if snippets enabled
if (StackExchange.settings.snippets.snippetsEnabled) {
StackExchange.using("snippets", function() {
createEditor();
});
}
else {
createEditor();
}
});
function createEditor() {
StackExchange.prepareEditor({
heartbeatType: 'answer',
autoActivateHeartbeat: false,
convertImagesToLinks: false,
noModals: true,
showLowRepImageUploadWarning: true,
reputationToPostImages: null,
bindNavPrevention: true,
postfix: "",
imageUploader: {
brandingHtml: "Powered by u003ca class="icon-imgur-white" href="https://imgur.com/"u003eu003c/au003e",
contentPolicyHtml: "User contributions licensed under u003ca href="https://creativecommons.org/licenses/by-sa/3.0/"u003ecc by-sa 3.0 with attribution requiredu003c/au003e u003ca href="https://stackoverflow.com/legal/content-policy"u003e(content policy)u003c/au003e",
allowUrls: true
},
onDemand: true,
discardSelector: ".discard-answer"
,immediatelyShowMarkdownHelp:true
});
}
});
Sign up or log in
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Required, but never shown
StackExchange.ready(
function () {
StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fdba.stackexchange.com%2fquestions%2f195065%2fwhy-mongodb-is-so-slow-on-some-queries-while-using-ixscan-as-a-winning-plan%23new-answer', 'question_page');
}
);
Post as a guest
Required, but never shown
1 Answer
1
active
oldest
votes
1 Answer
1
active
oldest
votes
active
oldest
votes
active
oldest
votes
High disk readahead could be the reason why physical memory is underutilized. If it's high, for random reads the system might be reading a lot of data MongoDB does not need, thus "polluting" the file system cache. That in return could result in extra page faults.
I suggest making sure your system complies with the Recommended Configuration as per Production Notes.
How can I set readahead inMongoDB
? Is there a way to set this onMongoDB MMAPV1
?
– ALH
Feb 24 '18 at 8:25
RA
in my disk server is set to256
, I checked it withblockdev --report
command. Is that a high number? Should I lower the value forRA
(readahead)?
– ALH
Feb 24 '18 at 8:30
add a comment |
High disk readahead could be the reason why physical memory is underutilized. If it's high, for random reads the system might be reading a lot of data MongoDB does not need, thus "polluting" the file system cache. That in return could result in extra page faults.
I suggest making sure your system complies with the Recommended Configuration as per Production Notes.
How can I set readahead inMongoDB
? Is there a way to set this onMongoDB MMAPV1
?
– ALH
Feb 24 '18 at 8:25
RA
in my disk server is set to256
, I checked it withblockdev --report
command. Is that a high number? Should I lower the value forRA
(readahead)?
– ALH
Feb 24 '18 at 8:30
add a comment |
High disk readahead could be the reason why physical memory is underutilized. If it's high, for random reads the system might be reading a lot of data MongoDB does not need, thus "polluting" the file system cache. That in return could result in extra page faults.
I suggest making sure your system complies with the Recommended Configuration as per Production Notes.
High disk readahead could be the reason why physical memory is underutilized. If it's high, for random reads the system might be reading a lot of data MongoDB does not need, thus "polluting" the file system cache. That in return could result in extra page faults.
I suggest making sure your system complies with the Recommended Configuration as per Production Notes.
answered Jan 16 '18 at 1:07
VaudevilleVillainVaudevilleVillain
1
1
How can I set readahead inMongoDB
? Is there a way to set this onMongoDB MMAPV1
?
– ALH
Feb 24 '18 at 8:25
RA
in my disk server is set to256
, I checked it withblockdev --report
command. Is that a high number? Should I lower the value forRA
(readahead)?
– ALH
Feb 24 '18 at 8:30
add a comment |
How can I set readahead inMongoDB
? Is there a way to set this onMongoDB MMAPV1
?
– ALH
Feb 24 '18 at 8:25
RA
in my disk server is set to256
, I checked it withblockdev --report
command. Is that a high number? Should I lower the value forRA
(readahead)?
– ALH
Feb 24 '18 at 8:30
How can I set readahead in
MongoDB
? Is there a way to set this on MongoDB MMAPV1
?– ALH
Feb 24 '18 at 8:25
How can I set readahead in
MongoDB
? Is there a way to set this on MongoDB MMAPV1
?– ALH
Feb 24 '18 at 8:25
RA
in my disk server is set to 256
, I checked it with blockdev --report
command. Is that a high number? Should I lower the value for RA
(readahead)?– ALH
Feb 24 '18 at 8:30
RA
in my disk server is set to 256
, I checked it with blockdev --report
command. Is that a high number? Should I lower the value for RA
(readahead)?– ALH
Feb 24 '18 at 8:30
add a comment |
Thanks for contributing an answer to Database Administrators Stack Exchange!
- Please be sure to answer the question. Provide details and share your research!
But avoid …
- Asking for help, clarification, or responding to other answers.
- Making statements based on opinion; back them up with references or personal experience.
To learn more, see our tips on writing great answers.
Sign up or log in
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Required, but never shown
StackExchange.ready(
function () {
StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fdba.stackexchange.com%2fquestions%2f195065%2fwhy-mongodb-is-so-slow-on-some-queries-while-using-ixscan-as-a-winning-plan%23new-answer', 'question_page');
}
);
Post as a guest
Required, but never shown
Sign up or log in
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Required, but never shown
Sign up or log in
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Required, but never shown
Sign up or log in
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
1
Looks like that everything slows down when mongod goes to the disk. How many IOPS your disks can handle? Especially read IOPS and latency.
– JJussi
Jan 12 '18 at 5:43
@JJussi Please see the edit part of the question, I have added
iostat
output.– ALH
Jan 13 '18 at 8:59
@JJussi thank you for your time, how can I see that latency? is r_await, what we're looking for?
– ALH
Jan 13 '18 at 10:25
?_await is average wait time in ms. You could run iostat without x-parameter and watch tps value to see iops. thattommyhall.com/2011/02/18/iops-linux-iostat Strange thing with mongodb is that latency plays a big role and it is hard to measure especially random read latency. Also "how storage is build" plays big role. F.ex. LVM on top of RAID is fast. RAID on top of LVM is slow, at least when both are software base.
– JJussi
Jan 13 '18 at 12:51
1
No, disk is not bottle neck for direct reads. Have to raise my hands... However, that memory consumption is still little bit mystery. Mongod should use all free memory.
– JJussi
Jan 15 '18 at 4:13