Curious to know how the MongoDB Index data being stored at underlying Data Files ?!!

The data directory of a MongoDB server is composed by the WiredTiger storage engine. If someone want to have a look at the content inside them you will need to use the “wt” (Wired Tiger) tool from the WiredTiger library which you can find from below URL.

https://source.wiredtiger.com/10.0.0/build-posix.html#posix_github ( OR )
https://github.com/wiredtiger/wiredtiger/

Nevertheless, checking out the WiredTiger files is not a mandatory MongoDB skill, but you can play around these files in case of any crash recovery cases when you don’t have replica members available.

Follow steps given at https://source.wiredtiger.com/10.0.0/build-posix.html#posix_github to get the WT tool installed.

Installation on RHEL -8 VM viewing_mongodb_Index_Data_using_wt_tool

Step1 : Installing all the prerequistes

[root@ip-172-31-5-247 ec2-user]# yum install git
[root@ip-172-31-5-247 ec2-user]# yum install wget
[root@ip-172-31-5-247 ec2-user]# yum install make
[root@ip-172-31-5-247 ec2-user]# dnf group install "Development Tools"
[root@ip-172-31-5-247 ec2-user]# git clone https://github.com/wiredtiger/wiredtiger.git

[root@ip-172-31-5-247 ec2-user]# cd wiredtiger/
[root@ip-172-31-5-247 wiredtiger]# git branch --all | grep mongodb
remotes/origin/mongodb-3.0
remotes/origin/mongodb-3.2
remotes/origin/mongodb-3.4
remotes/origin/mongodb-3.6
remotes/origin/mongodb-4.0
remotes/origin/mongodb-4.2
remotes/origin/mongodb-4.4
remotes/origin/mongodb-5.0
remotes/origin/mongodb-5.1
remotes/origin/mongodb-5.2
remotes/origin/mongodb-5.3
remotes/origin/mongodb-6.0
remotes/origin/mongodb-master
[root@ip-172-31-5-247 wiredtiger]#
[root@ip-172-31-5-247 wiredtiger]#
[root@ip-172-31-5-247 wiredtiger]# git checkout mongodb-4.4
Branch 'mongodb-4.4' set up to track remote branch 'mongodb-4.4' from 'origin'.
Switched to a new branch 'mongodb-4.4'
[root@ip-172-31-5-247 wiredtiger]#
[root@ip-172-31-5-247 wiredtiger]# sh -x autogen.sh
[root@ip-172-31-5-247 wiredtiger]# git clone https://github.com/google/snappy.git
[root@ip-172-31-5-247 wiredtiger]# curl -O https://vault.centos.org/centos/8/PowerTools/x86_64/os/Packages/snappy-devel-1.1.8-3.el8.x86_64.rpm
[root@ip-172-31-5-247 wiredtiger]# rpm -vhi snappy-devel-1.1.8-3.el8.x86_64.rpm
[root@ip-172-31-5-247 wiredtiger]# yum install cmake-filesystem
[root@ip-172-31-5-247 wiredtiger]# rpm -vhi snappy-devel-1.1.8-3.el8.x86_64.rpm
[root@ip-172-31-5-247 wiredtiger]# git clone https://github.com/kjdev/php-ext-lz4
[root@ip-172-31-5-247 wiredtiger]# yum install lz4-devel
[root@ip-172-31-5-247 wiredtiger]# ./configure --disable-shared --with-builtins=lz4,snappy,zlib,zstd
[root@ip-172-31-5-247 wiredtiger]# make -j $(nproc)

[root@ip-172-31-5-247 wiredtiger]# sudo make install
[root@ip-172-31-5-247 wiredtiger]# dnf install libcurl-devel
[root@ip-172-31-5-247 wiredtiger]#
[root@ip-172-31-5-247 ec2-user]# dnf install python3-devel openssl-devel
[root@ip-172-31-5-247 wiredtiger]# /usr/local/bin/wt

[root@ip-172-31-5-247 wiredtiger]# pwd
/home/ec2-user/wiredtiger
[root@ip-172-31-5-247 wiredtiger]#
[root@ip-172-31-5-247 wiredtiger]# ll -lhtr
total 28M
-rw-r--r--. 1 root root 417 Apr 19 09:13 RELEASE_INFO
-rw-r--r--. 1 root root 716 Apr 19 09:13 README
-rw-r--r--. 1 root root 353 Apr 19 09:13 INSTALL
drwxr-xr-x. 3 root root 20 Apr 19 09:13 lang
drwxr-xr-x. 4 root root 35 Apr 19 09:14 bench
drwxr-xr-x. 28 root root 4.0K Apr 19 09:14 src
-rw-r--r--. 1 root root 5.7K Apr 19 09:14 CMakeLists.txt
-rw-r--r--. 1 root root 833 Apr 19 09:14 LICENSE
-rw-r--r--. 1 root root 149K Apr 19 09:14 NEWS
-rw-r--r--. 1 root root 14K Apr 19 09:14 SConstruct
-rwxr-xr-x. 1 root root 119 Apr 19 09:14 autogen.sh
drwxr-xr-x. 2 root root 55 Apr 19 09:14 build_win
drwxr-xr-x. 6 root root 106 Apr 19 09:14 cmake
drwxr-xr-x. 4 root root 51 Apr 19 09:14 examples
drwxr-xr-x. 2 root root 4.0K Apr 19 09:14 dist
drwxr-xr-x. 8 root root 135 Apr 19 09:14 ext
drwxr-xr-x. 26 root root 4.0K Apr 19 09:14 test
drwxr-xr-x. 4 root root 92 Apr 19 09:14 tools
-rw-r--r--. 1 root root 11K Apr 19 09:14 configure.ac
-rw-r--r--. 1 root root 19K Apr 19 09:14 Makefile.am
-rw-r--r--. 1 root root 51K Apr 19 09:14 aclocal.m4
-rwxr-xr-x. 1 root root 655K Apr 19 09:14 configure
drwxr-xr-x. 4 root root 172 Apr 19 09:14 build_posix
-rw-r--r--. 1 root root 155K Apr 19 09:14 Makefile.in
drwxr-xr-x. 8 root root 4.0K Apr 19 09:32 snappy
-rw-r--r--. 1 root root 26K Apr 19 11:38 snappy-devel-1.1.8-3.el8.x86_64.rpm
drwxr-xr-x. 7 root root 4.0K Apr 19 11:50 php-ext-lz4
-rw-r--r--. 1 root root 927 Apr 19 11:54 libwiredtiger.la
-rwxr-xr-x. 1 root root 26M Apr 19 11:54 wt
-rwxr-xr-x. 1 root root 73K Apr 19 12:53 config.status
-rw-r--r--. 1 root root 143K Apr 19 12:53 Makefile
-rw-r--r--. 1 root root 307K Apr 19 12:53 wiredtiger.h
-rw-r--r--. 1 root root 20K Apr 19 12:53 wiredtiger_ext.h
-rw-r--r--. 1 root root 237 Apr 19 12:53 wiredtiger.pc
-rw-r--r--. 1 root root 5.5K Apr 19 12:53 wiredtiger_config.h
-rw-r--r--. 1 root root 34 Apr 19 12:53 stamp-h1
-rwxr-xr-x. 1 root root 337K Apr 19 12:53 libtool
-rw-r--r--. 1 root root 60K Apr 19 12:53 config.log
[root@ip-172-31-5-247 wiredtiger]#

Step2 : Installing MongoDB 4.4 and creating collection and adding an Index

[root@ip-172-31-5-247 wiredtiger]# curl -O https://repo.mongodb.org/yum/redhat/8/mongodb-org/4.4/x86_64/RPMS/mongodb-org-shell-4.4.13-1.el8.x86_64.rpm
[root@ip-172-31-5-247 wiredtiger]# rpm -ivh mongodb-org-shell-4.4.13-1.el8.x86_64.rpm
[root@ip-172-31-5-247 wiredtiger]#
[root@ip-172-31-5-247 wiredtiger]#
[root@ip-172-31-5-247 wiredtiger]#
[root@ip-172-31-5-247 wiredtiger]# curl -O https://repo.mongodb.org/yum/redhat/8/mongodb-org/4.4/x86_64/RPMS/mongodb-org-server-4.4.13-1.el8.x86_64.rpm
[root@ip-172-31-5-247 wiredtiger]# rpm -ivh mongodb-org-server-4.4.13-1.el8.x86_64.rpm
[root@ip-172-31-5-247 wiredtiger]#
[root@ip-172-31-5-247 wiredtiger]# mkdir -p /data/db
[root@ip-172-31-5-247 wiredtiger]#
[root@ip-172-31-5-247 wiredtiger]# mongod --logpath ./mongod.log --fork
[root@ip-172-31-5-247 wiredtiger]#
[root@ip-172-31-5-247 wiredtiger]# mongo
---
> use mydb
switched to db mydb
>
> for(i=0;i<10000;i++){db.mycol.insert({_id:i, name:'abc'+i, value: 1000*i+10000 })} WriteResult({ "nInserted" : 1 }) >
> db.mycol.createIndex({name:1})
{
"createdCollectionAutomatically" : false,
"numIndexesBefore" : 1,
"numIndexesAfter" : 2,
"ok" : 1
}
>
> db.mycol.getIndexes()
[
{
"v" : 2,
"key" : {
"_id" : 1
},
"name" : "_id_"
},
{
"v" : 2,
"key" : {
"name" : 1
},
"name" : "name_1"
}
]
> exit
bye
[root@ip-172-31-5-247 wiredtiger]#
[root@ip-172-31-5-247 wiredtiger]#
[root@ip-172-31-5-247 wiredtiger]#
[root@ip-172-31-5-247 wiredtiger]#
[root@ip-172-31-5-247 wiredtiger]# ll -lhrt /data/db/
total 816K
-rw-------. 1 root root 21 Apr 19 13:11 WiredTiger.lock
-rw-------. 1 root root 50 Apr 19 13:11 WiredTiger
-rw-------. 1 root root 4.0K Apr 19 13:11 WiredTigerHS.wt
-rw-------. 1 root root 6 Apr 19 13:11 mongod.lock
drwx------. 2 root root 110 Apr 19 13:11 journal
-rw-------. 1 root root 114 Apr 19 13:11 storage.bson
-rw-------. 1 root root 4.0K Apr 19 13:11 collection-4--588032764603920700.wt
-rw-------. 1 root root 4.0K Apr 19 13:11 index-5--588032764603920700.wt
-rw-------. 1 root root 4.0K Apr 19 13:11 index-6--588032764603920700.wt
-rw-------. 1 root root 20K Apr 19 13:12 index-3--588032764603920700.wt
-rw-------. 1 root root 20K Apr 19 13:12 index-1--588032764603920700.wt
-rw-------. 1 root root 20K Apr 19 13:12 collection-2--588032764603920700.wt
-rw-------. 1 root root 20K Apr 19 13:12 collection-0--588032764603920700.wt
-rw-------. 1 root root 124K Apr 19 13:13 index-8--588032764603920700.wt
-rw-------. 1 root root 20K Apr 19 13:13 sizeStorer.wt
-rw-------. 1 root root 36K Apr 19 13:13 _mdb_catalog.wt
-rw-------. 1 root root 184K Apr 19 13:13 collection-7--588032764603920700.wt
-rw-------. 1 root root 72K Apr 19 13:13 WiredTiger.wt
-rw-------. 1 root root 1.5K Apr 19 13:13 WiredTiger.turtle
-rw-------. 1 root root 132K Apr 19 13:13 index-9--588032764603920700.wt
drwx------. 2 root root 71 Apr 19 13:13 diagnostic.data
[root@ip-172-31-5-247 wiredtiger]#

[root@ip-172-31-5-247 db]# curl -O https://fastdl.mongodb.org/tools/db/mongodb-database-tools-rhel80-x86_64-100.5.2.rpm
[root@ip-172-31-5-247 db]#
[root@ip-172-31-5-247 db]# yum install cyrus-sasl cyrus-sasl-gssapi cyrus-sasl-plain
[root@ip-172-31-5-247 db]#
[root@ip-172-31-5-247 db]# rpm -ivh mongodb-database-tools-rhel80-x86_64-100.5.2.rpm
warning: mongodb-database-tools-rhel80-x86_64-100.5.2.rpm: Header V3 RSA/SHA1 Signature, key ID 90cfb1f5: NOKEY
Verifying... ################################# [100%]
Preparing... ################################# [100%]
Updating / installing...
1:mongodb-database-tools-100.5.2-1 ################################# [100%]
[root@ip-172-31-5-247 db]#
[root@ip-172-31-5-247 db]#
[root@ip-172-31-5-247 db]#

[root@ip-172-31-5-247 wiredtiger]#
[root@ip-172-31-5-247 wiredtiger]# cd /data/db/
[root@ip-172-31-5-247 db]#
[root@ip-172-31-5-247 db]# mongo --quiet

> use admin
switched to db admin
>
> db.shutdownServer()
server should be down...
>
>
> exit
{"t":{"$date":"2022-04-19T13:16:07.020Z"},"s":"I", "c":"QUERY", "id":22791, "ctx":"js","msg":"Failed to end logical session","attr":{"lsid":{"id":{"$uuid":"cafac471-85f1-49b8-a8f4-b3ff170c7c5e"}},"error":{"code":9001,"codeName":"SocketException","errmsg":"socket exception [CONNECT_ERROR] server [couldn't connect to server 127.0.0.1:27017, connection attempt failed: SocketException: Error connecting to 127.0.0.1:27017 :: caused by :: Connection refused]"}}}
[root@ip-172-31-5-247 db]#


As we’ve creted Index on mycol collection from mydb database on the field ‘name’ as an ascending order, let’s check the data inside the Index file whether it’s storing it Text Ascending Sort.

[root@ip-172-31-5-247 db]#
[root@ip-172-31-5-247 db]# yum install jq

[root@ip-172-31-5-247 db]# yum install vim-common
[root@ip-172-31-5-247 db]#
[root@ip-172-31-5-247 db]# wt dump -x table:_mdb_catalog | tail -n +7 | awk 'NR%2 == 0 { print }' | xxd -r -p | bsondump --quiet | jq -r 'select(. | has("md")) | [.ident, .ns] | @tsv' | sort > wt_ident_vs_collection_ns.tsv
[root@ip-172-31-5-247 db]#
[root@ip-172-31-5-247 db]#
[root@ip-172-31-5-247 db]#
[root@ip-172-31-5-247 db]# ll -lhtr
total 48M
-rw-------. 1 root root 21 Apr 19 13:11 WiredTiger.lock
-rw-------. 1 root root 50 Apr 19 13:11 WiredTiger
drwx------. 2 root root 110 Apr 19 13:11 journal
-rw-------. 1 root root 114 Apr 19 13:11 storage.bson
-rw-------. 1 root root 132K Apr 19 13:13 index-9--588032764603920700.wt
-rw-------. 1 root root 124K Apr 19 13:16 index-8--588032764603920700.wt
-rw-------. 1 root root 36K Apr 19 13:16 sizeStorer.wt
-rw-------. 1 root root 4.0K Apr 19 13:16 index-6--588032764603920700.wt
-rw-------. 1 root root 4.0K Apr 19 13:16 index-5--588032764603920700.wt
-rw-------. 1 root root 20K Apr 19 13:16 index-3--588032764603920700.wt
-rw-------. 1 root root 20K Apr 19 13:16 index-1--588032764603920700.wt
-rw-------. 1 root root 184K Apr 19 13:16 collection-7--588032764603920700.wt
-rw-------. 1 root root 4.0K Apr 19 13:16 collection-4--588032764603920700.wt
-rw-------. 1 root root 20K Apr 19 13:16 collection-2--588032764603920700.wt
-rw-------. 1 root root 20K Apr 19 13:16 collection-0--588032764603920700.wt
-rw-------. 1 root root 0 Apr 19 13:16 mongod.lock
drwx------. 2 root root 48 Apr 19 13:16 diagnostic.data
-rw-r--r--. 1 root root 48M Apr 19 13:18 mongodb-database-tools-rhel80-x86_64-100.5.2.rpm
-rw-------. 1 root root 4.0K Apr 19 13:19 WiredTigerHS.wt
-rw-------. 1 root root 36K Apr 19 13:19 _mdb_catalog.wt
-rw-r--r--. 1 root root 1.5K Apr 19 13:19 WiredTiger.turtle
-rw-------. 1 root root 52K Apr 19 13:19 WiredTiger.wt
-rw-r--r--. 1 root root 205 Apr 19 13:19 wt_ident_vs_collection_ns.tsv
[root@ip-172-31-5-247 db]#
[root@ip-172-31-5-247 db]#
[root@ip-172-31-5-247 db]# head wt_ident_vs_collection_ns.tsv
collection-0--588032764603920700 admin.system.version
collection-2--588032764603920700 local.startup_log
collection-4--588032764603920700 config.system.sessions
collection-7--588032764603920700 mydb.mycol
[root@ip-172-31-5-247 db]#
[root@ip-172-31-5-247 db]#

To convert the Index Bindary data using bsondump & read it using WT dump and write to a file with name : wt_ident_vs_index_ns.tsv

[root@ip-172-31-5-247 db]#
[root@ip-172-31-5-247 db]# wt dump -x table:_mdb_catalog | tail -n +7 | awk 'NR%2 == 0 { print }' | xxd -r -p | bsondump --quiet | jq -r 'select(. | has("idxIdent")) | .ns as $nsT | .idxIdent | to_entries[] | [.value, $nsT, .key] | @tsv' | sort > wt_ident_vs_index_ns.tsv
[root@ip-172-31-5-247 db]#
[root@ip-172-31-5-247 db]# cat wt_ident_vs_index_ns.tsv
index-1--588032764603920700 admin.system.version _id_
index-3--588032764603920700 local.startup_log _id_
index-5--588032764603920700 config.system.sessions _id_
index-6--588032764603920700 config.system.sessions lsidTTLIndex
index-8--588032764603920700 mydb.mycol _id_
index-9--588032764603920700 mydb.mycol name_1
[root@ip-172-31-5-247 db]#
[root@ip-172-31-5-247 db]#

Reading Data from Index File with Index as { name : 1 }

[root@ip-172-31-5-247 db]# wt dump index-9--588032764603920700 | head -15
WiredTiger Dump (WiredTiger Version 10.0.2)
Format=print
Header
table:index-9--588032764603920700
access_pattern_hint=none,allocation_size=4KB,app_metadata=(formatVersion=8),assert=(commit_timestamp=none,durable_timestamp=none,read_timestamp=none,write_timestamp=off),block_allocation=best,block_compressor=,cache_resident=false,checksum=on,colgroups=,collator=,columns=,dictionary=0,encryption=(keyid=,name=),exclusive=false,extractor=,format=btree,huffman_key=,huffman_value=,ignore_in_memory_cache_size=false,immutable=false,import=(enabled=false,file_metadata=,repair=false),internal_item_max=0,internal_key_max=0,internal_key_truncate=true,internal_page_max=16k,key_format=u,key_gap=10,leaf_item_max=0,leaf_key_max=0,leaf_page_max=16k,leaf_value_max=0,log=(enabled=true),lsm=(auto_throttle=true,bloom=true,bloom_bit_count=16,bloom_config=,bloom_hash_count=8,bloom_oldest=false,chunk_count_limit=0,chunk_max=5GB,chunk_size=10MB,merge_custom=(prefix=,start_generation=0,suffix=),merge_max=15,merge_min=0),memory_page_image_max=0,memory_page_max=5MB,os_cache_dirty_max=0,os_cache_max=0,prefix_compression=true,prefix_compression_min=4,readonly=false,source="file:index-9--588032764603920700.wt",split_deepen_min_child=0,split_deepen_per_child=0,split_pct=90,tiered_object=false,tiered_storage=(auth_token=,bucket=,bucket_prefix=,cache_directory=,local_retention=300,name=,object_target_size=10M),type=file,value_format=u,verbose=[],write_timestamp_usage=none
Data

You can observe the data in Ascending order in the Index file i.e., abc0, abc1, abc10, acb100, abc1000, Whereas the actual data we see in the collection is in the order of abc0, abc1, abc2 .. etc

[root@ip-172-31-5-247 db]# mongod --logpath ./mongod.log --fork
about to fork child process, waiting until server is ready for connections.
forked process: 74249
child process started successfully, parent exiting
[root@ip-172-31-5-247 db]#
[root@ip-172-31-5-247 db]# mongo localhost:27017/mydb --quiet
> db.mycol.find({},{name:1,_id:0})
{ "name" : "abc0" }
{ "name" : "abc1" }
{ "name" : "abc2" }
{ "name" : "abc3" }
{ "name" : "abc4" }
{ "name" : "abc5" }
{ "name" : "abc6" }
{ "name" : "abc7" }
{ "name" : "abc8" }
{ "name" : "abc9" }
{ "name" : "abc10" }
{ "name" : "abc11" }
{ "name" : "abc12" }
{ "name" : "abc13" }
{ "name" : "abc14" }
{ "name" : "abc15" }
{ "name" : "abc16" }
{ "name" : "abc17" }
{ "name" : "abc18" }
{ "name" : "abc19" }
Type "it" for more
>
>

Similarly, you can also see the collection data using below commands

[root@ip-172-31-5-247 db]# ll -lhtr collection-*
-rw-------. 1 root root 24K Apr 19 16:45 collection-4--588032764603920700.wt
-rw-------. 1 root root 36K Apr 20 15:19 collection-2--588032764603920700.wt
-rw-------. 1 root root 20K Apr 20 15:19 collection-0--588032764603920700.wt
-rw-------. 1 root root 232K Apr 20 15:19 collection-0-3779508954732058151.wt
-rw-------. 1 root root 184K Apr 20 15:20 collection-7--588032764603920700.wt
[root@ip-172-31-5-247 db]#
[root@ip-172-31-5-247 db]#
[root@ip-172-31-5-247 db]# wt dump -x collection-7--588032764603920700 | tail -n +7 | awk 'NR%2 == 0 { print }' | xxd -r -p > mydb.mycol.bson
[root@ip-172-31-5-247 db]#
[root@ip-172-31-5-247 db]# bsondump --quiet mydb.mycol.bson | head -20
{"_id":{"$numberDouble":"0.0"},"name":"abc0","value":{"$numberDouble":"10000.0"}}
{"_id":{"$numberDouble":"1.0"},"name":"abc1","value":{"$numberDouble":"11000.0"}}
{"_id":{"$numberDouble":"2.0"},"name":"abc2","value":{"$numberDouble":"12000.0"}}
{"_id":{"$numberDouble":"3.0"},"name":"abc3","value":{"$numberDouble":"13000.0"}}
{"_id":{"$numberDouble":"4.0"},"name":"abc4","value":{"$numberDouble":"14000.0"}}
{"_id":{"$numberDouble":"5.0"},"name":"abc5","value":{"$numberDouble":"15000.0"}}
{"_id":{"$numberDouble":"6.0"},"name":"abc6","value":{"$numberDouble":"16000.0"}}
{"_id":{"$numberDouble":"7.0"},"name":"abc7","value":{"$numberDouble":"17000.0"}}
{"_id":{"$numberDouble":"8.0"},"name":"abc8","value":{"$numberDouble":"18000.0"}}
{"_id":{"$numberDouble":"9.0"},"name":"abc9","value":{"$numberDouble":"19000.0"}}
{"_id":{"$numberDouble":"10.0"},"name":"abc10","value":{"$numberDouble":"20000.0"}}
{"_id":{"$numberDouble":"11.0"},"name":"abc11","value":{"$numberDouble":"21000.0"}}
{"_id":{"$numberDouble":"12.0"},"name":"abc12","value":{"$numberDouble":"22000.0"}}
{"_id":{"$numberDouble":"13.0"},"name":"abc13","value":{"$numberDouble":"23000.0"}}
{"_id":{"$numberDouble":"14.0"},"name":"abc14","value":{"$numberDouble":"24000.0"}}
{"_id":{"$numberDouble":"15.0"},"name":"abc15","value":{"$numberDouble":"25000.0"}}
{"_id":{"$numberDouble":"16.0"},"name":"abc16","value":{"$numberDouble":"26000.0"}}
{"_id":{"$numberDouble":"17.0"},"name":"abc17","value":{"$numberDouble":"27000.0"}}
{"_id":{"$numberDouble":"18.0"},"name":"abc18","value":{"$numberDouble":"28000.0"}}
{"_id":{"$numberDouble":"19.0"},"name":"abc19","value":{"$numberDouble":"29000.0"}}
[root@ip-172-31-5-247 db]#

Hope this is helpful !

References : https://www.percona.com/blog/2021/06/22/wiredtiger-file-forensics-part-3-viewing-all-the-mongodb-data/

  • Ask Question