MongoDB-11-索引模块-数据结构

1. 核心数据结构概览

索引模块的数据结构设计采用分层架构,从上层的索引描述符到底层的存储结构,形成了完整的索引数据管理体系。

1.1 数据结构分类

  • 描述符层: IndexDescriptor、IndexCatalogEntry
  • 访问方法层: IndexAccessMethod及其子类
  • 键值处理层: KeyString、BSONObjSet
  • 存储接口层: SortedDataInterface、RecordId
  • 构建器层: IndexBuilder、BulkBuilder

2. 核心类图

2.1 索引描述符类图

classDiagram
    class IndexDescriptor {
        -BSONObj _keyPattern
        -string _indexName
        -BSONObj _infoObj
        -IndexVersion _version
        -bool _isIdIndex
        -bool _isSparse
        -bool _isUnique
        -bool _isPartial
        -PartialFilterExpression _partialFilterExpression
        +keyPattern() BSONObj
        +indexName() string
        +unique() bool
        +sparse() bool
        +partial() bool
        +getAccessMethodName() string
        +compareIndexOptions() bool
    }

    class IndexCatalogEntry {
        -shared_ptr~IndexDescriptor~ _descriptor
        -unique_ptr~IndexAccessMethod~ _accessMethod
        -shared_ptr~Ident~ _ident
        -bool _isReady
        -bool _isFrozen
        -Timestamp _readyTimestamp
        -BSONObj _multikeyPaths
        +descriptor() IndexDescriptor*
        +accessMethod() IndexAccessMethod*
        +isReady() bool
        +setIsReady(bool ready)
        +ident() Ident*
    }

    class PartialFilterExpression {
        -unique_ptr~MatchExpression~ _expression
        -BSONObj _serializedExpression
        +getFilter() MatchExpression*
        +serialize() BSONObj
        +matches(BSONObj doc) bool
    }

    class Ident {
        -string _identString
        -NamespaceString _nss
        -string _indexName
        +toString() string
        +ns() NamespaceString
        +getIdent() string
    }

    IndexCatalogEntry --> IndexDescriptor : contains
    IndexDescriptor --> PartialFilterExpression : uses
    IndexCatalogEntry --> Ident : references

2.2 索引访问方法类图

classDiagram
    class IndexAccessMethod {
        <<abstract>>
        +insert(opCtx, records)* Status
        +remove(opCtx, obj, loc)* void
        +update(opCtx, oldDoc, newDoc)* Status
        +initializeAsEmpty()* Status
        +validate(opCtx, options)* IndexValidateResults
        +numKeys(opCtx)* int64_t
        +getSpaceUsedBytes(opCtx)* long long
        +make(opCtx, entry, ident)$ unique_ptr~IndexAccessMethod~
    }

    class SortedDataIndexAccessMethod {
        <<abstract>>
        -unique_ptr~SortedDataInterface~ _newInterface
        -IndexCatalogEntry* _btreeState
        +getKeys(obj, keys)* void
        +getSortedDataInterface() SortedDataInterface*
        +setIndexIsMultikey(paths) void
    }

    class BtreeAccessMethod {
        +BtreeAccessMethod(entry, interface)
        +getKeys(obj, keys) void
        +insert(opCtx, records) Status
        +remove(opCtx, obj, loc) void
        -_extractKeys(obj, keySet) void
        -_handleArrayField(elem, keys) void
    }

    class HashAccessMethod {
        +HashAccessMethod(entry, interface)
        +getKeys(obj, keys) void
        -_hash(elem) long long
        -_makeSingleKey(elem, hashValue) BSONObj
    }

    class S2AccessMethod {
        -unique_ptr~S2IndexingParams~ _params
        +S2AccessMethod(entry, interface)
        +getKeys(obj, keys) void
        -_getS2Keys(geoElement, keys) void
        -_getCellsForGeometry(elem, cells) void
    }

    class FTSAccessMethod {
        -unique_ptr~FTSSpec~ _ftsSpec
        +FTSAccessMethod(entry, interface)
        +getKeys(obj, keys) void
        -_scoreDocument(doc) TermFrequencyMap
        -_extractTerms(text, terms) void
    }

    class WildcardAccessMethod {
        -BSONObj _wildcardProjection
        +WildcardAccessMethod(entry, interface)
        +getKeys(obj, keys) void
        -_extractAllPaths(path, elem, keys) void
        -_applyProjection(doc, projection) BSONObj
    }

    class TextIndexAccessMethod {
        -unique_ptr~FTSSpec~ _ftsSpec
        +TextIndexAccessMethod(entry, interface)
        +getKeys(obj, keys) void
        -_tokenizeText(text, tokens) void
        -_buildIndexKeys(tokens, keys) void
    }

    IndexAccessMethod <|-- SortedDataIndexAccessMethod
    SortedDataIndexAccessMethod <|-- BtreeAccessMethod
    SortedDataIndexAccessMethod <|-- HashAccessMethod
    SortedDataIndexAccessMethod <|-- S2AccessMethod
    SortedDataIndexAccessMethod <|-- FTSAccessMethod
    SortedDataIndexAccessMethod <|-- WildcardAccessMethod
    SortedDataIndexAccessMethod <|-- TextIndexAccessMethod

2.3 索引键值处理类图

classDiagram
    class KeyString {
        <<namespace>>
        +Value
        +Builder
        +Discriminator
    }

    class KeyStringValue {
        -SharedBuffer _buffer
        -size_t _size
        -TypeBits _typeBits
        +getBuffer() SharedBuffer
        +getSize() size_t
        +getTypeBits() TypeBits
        +compare(other) int
        +isEmpty() bool
    }

    class KeyStringBuilder {
        -BufBuilder _buffer
        -TypeBitsBuilder _typeBits
        -Ordering _order
        -Version _version
        +appendBSONElement(elem) Builder&
        +appendString(str) Builder&
        +appendNumber(num) Builder&
        +appendRecordId(rid) Builder&
        +getValueCopy() Value
        +release() Value
    }

    class BSONObjSet {
        -set~BSONObj~ _objects
        -BSONObj::Comparator _comparator
        +insert(obj) pair~iterator, bool~
        +find(obj) iterator
        +erase(obj) size_t
        +size() size_t
        +empty() bool
    }

    class RecordId {
        -int64_t _id
        -string _str
        -Type _type
        +RecordId(id)
        +RecordId(str)
        +isLong() bool
        +isStr() bool
        +getLong() int64_t
        +getStr() string
        +compare(other) int
    }

    class MultikeyPaths {
        -vector~set~size_t~~ _paths
        +operator[](index) set~size_t~&
        +size() size_t
        +empty() bool
        +clear() void
    }

    KeyString ..> KeyStringValue : creates
    KeyString ..> KeyStringBuilder : creates
    BSONObjSet --> BSONObj : contains
    KeyStringValue --> RecordId : references
    KeyStringBuilder --> RecordId : appends

2.4 存储接口类图

classDiagram
    class SortedDataInterface {
        <<abstract>>
        +insertKeys(keys, recordIds)* Status
        +removeKeys(keys, recordIds)* void
        +unindex(key, recordId)* void
        +newCursor(opCtx)* unique_ptr~Cursor~
        +makeBulkBuilder(opCtx)* unique_ptr~BulkBuilder~
        +truncate(opCtx)* Status
        +isEmpty(opCtx)* bool
    }

    class SortedDataBuilderInterface {
        <<abstract>>
        +addKey(key, recordId)* Status
        +commit(dupsAllowed)* Status
        +numKeys()* int64_t
    }

    class Cursor {
        <<abstract>>
        +setEndPosition(key, inclusive)* void
        +seek(key, inclusive)* boost::optional~Entry~
        +next()* boost::optional~Entry~
        +seekExact(key)* boost::optional~Entry~
        +save()* void
        +restore()* void
    }

    class Entry {
        +key KeyString::Value
        +recordId RecordId
        +Entry(k, rid)
    }

    class IndexValidateResults {
        +valid bool
        +warnings vector~string~
        +errors vector~string~
        +keysTraversed int64_t
        +numRecords int64_t
    }

    class CompactOptions {
        +dryRun bool
        +paddingFactor double
        +paddingBytes int
        +validateDocuments bool
    }

    SortedDataInterface ..> SortedDataBuilderInterface : creates
    SortedDataInterface ..> Cursor : creates
    Cursor --> Entry : returns
    SortedDataInterface ..> IndexValidateResults : produces
    SortedDataInterface ..> CompactOptions : uses

2.5 索引构建器类图

classDiagram
    class IndexBuilder {
        -CollectionPtr _collection
        -IndexCatalogEntry* _entry
        -string _tempDir
        -size_t _maxMemoryUsage
        -size_t _docsProcessed
        +buildIndex(opCtx, background) Status
        +backgroundBuild(opCtx) Status
        +foregroundBuild(opCtx) Status
        -_scanAndSort(opCtx, sorter) Status
        -_insertSortedKeys(opCtx, sorter) Status
    }

    class MultiIndexBuilder {
        -vector~IndexBuilder*~ _builders
        -ThreadPool _threadPool
        +addBuilder(builder) void
        +buildAllIndexes(opCtx) Status
        -_buildIndexParallel(builder) void
    }

    class BackgroundIndexBuilder {
        -IndexBuildOptions _options
        -BackgroundOperation _backgroundOp
        +startBuild(opCtx) Status
        +resumeBuild(opCtx) Status
        +abortBuild(opCtx) Status
        -_phase1Scan(opCtx) Status
        -_phase2ProcessDeltas(opCtx) Status
        -_phase3Finalize(opCtx) Status
    }

    class IndexBuildCoordinator {
        -map~BuildId, IndexBuild~ _indexBuilds
        -ReplicationCoordinator* _replCoord
        +startIndexBuild(opCtx, spec) BuildId
        +commitIndexBuild(opCtx, buildId) Status
        +abortIndexBuild(opCtx, buildId) Status
        +getIndexBuildState(buildId) IndexBuildState
    }

    class Sorter {
        <<template>>
        -string _tempDir
        -size_t _maxMemoryUsage
        -File _tempFile
        -Settings _settings
        +add(key, value) void
        +done() Iterator*
        +numDataBytes() size_t
    }

    class BSONObjSorter {
        +BSONObjSorter(options)
        +add(key, recordId) void
        +done() Iterator*
    }

    IndexBuilder --> Sorter : uses
    MultiIndexBuilder --> IndexBuilder : manages
    BackgroundIndexBuilder --> IndexBuilder : extends
    IndexBuildCoordinator --> BackgroundIndexBuilder : orchestrates
    IndexBuilder ..> BSONObjSorter : creates
    Sorter <|-- BSONObjSorter

3. 索引类型特定数据结构

3.1 地理索引数据结构

classDiagram
    class S2IndexingParams {
        +int coarsestIndexedLevel
        +int finestIndexedLevel
        +int maxCellsInCovering
        +double radiusOfEarthInRadians
        +S2IndexingParams()
        +configureCoverer(coverer) void
    }

    class S2CellId {
        +uint64 id
        +int level
        +S2CellId(id)
        +S2CellId(point)
        +parent(level) S2CellId
        +child(pos) S2CellId
        +isLeaf() bool
        +toString() string
    }

    class GeoHash {
        +string _hash
        +BSONElement _type
        +double _lng
        +double _lat
        +GeoHash(lng, lat, precision)
        +getHash() string
        +getNeighbors() vector~GeoHash~
    }

    class Point {
        +double x
        +double y
        +Point(x, y)
        +distanceTo(other) double
        +toString() string
    }

    class S2Region {
        <<abstract>>
        +Contains(point)* bool
        +IntersectsCap(cap)* bool
        +GetCapBound()* S2Cap
    }

    class S2Polygon {
        +vector~S2Loop~ loops
        +S2Polygon(loops)
        +Contains(point) bool
        +Intersects(other) bool
        +GetArea() double
    }

    S2IndexingParams ..> S2CellId : configures
    S2Region <|-- S2Polygon
    Point ..> GeoHash : converts

3.2 全文索引数据结构

classDiagram
    class FTSSpec {
        -BSONObj _indexSpec
        -LanguageMap _languageMap
        -map~string, double~ _weights
        -string _defaultLanguage
        +FTSSpec(spec)
        +scoreDocument(doc) TermFrequencyMap
        +getLanguage(doc) string
        +getWeight(field) double
    }

    class TermFrequencyMap {
        -map~string, int~ _terms
        +addTerm(term, frequency) void
        +getFrequency(term) int
        +getTotalTerms() size_t
        +getTerms() vector~string~
    }

    class FTSLanguage {
        -string _languageCode
        -unique_ptr~Stemmer~ _stemmer
        -set~string~ _stopWords
        +FTSLanguage(code)
        +stem(word) string
        +isStopWord(word) bool
        +tokenize(text) vector~string~
    }

    class Stemmer {
        <<abstract>>
        +stem(word)* string
        +getStemmedForm(word)* string
    }

    class TextIndexEntry {
        +string term
        +double score
        +RecordId recordId
        +TextIndexEntry(term, score, id)
        +compare(other) int
    }

    FTSSpec --> TermFrequencyMap : produces
    FTSSpec --> FTSLanguage : uses
    FTSLanguage --> Stemmer : contains
    FTSSpec ..> TextIndexEntry : creates

3.3 哈希索引数据结构

classDiagram
    class HashAccessMethod {
        +static hashElement(elem, seed) long long
        +static makeSingleKey(elem, hashValue, keyPattern) BSONObj
        -_hasher ExpressionHash
    }

    class ExpressionHash {
        -int _seed
        -HashAlgorithm _algorithm
        +ExpressionHash(seed)
        +hash(elem) long long
        +hashString(str) long long
        +hashNumber(num) long long
    }

    class HashSeed {
        +static const int DEFAULT_SEED
        +static getSeed(indexSpec) int
        +static isValidSeed(seed) bool
    }

    class HashedIndexKey {
        +long long hashValue
        +BSONElement originalValue
        +HashedIndexKey(hash, original)
        +toBSON() BSONObj
        +compare(other) int
    }

    HashAccessMethod --> ExpressionHash : uses
    HashAccessMethod --> HashSeed : uses
    HashAccessMethod ..> HashedIndexKey : creates

4. 索引构建过程数据结构

4.1 构建状态管理

classDiagram
    class IndexBuildState {
        <<enumeration>>
        SETUP
        IN_PROGRESS
        DRAINING
        READY
        ABORTED
        COMMITTED
    }

    class IndexBuildBlock {
        -OperationContext* _opCtx
        -IndexCatalogEntry* _entry
        -bool _acquired
        +IndexBuildBlock(opCtx, entry)
        +acquire() void
        +release() void
        +isAcquired() bool
    }

    class IndexBuildsManager {
        -map~BuildId, IndexBuildInfo~ _builds
        -mutex _mutex
        +startBuild(buildId, info) void
        +getBuild(buildId) IndexBuildInfo*
        +finishBuild(buildId) void
        +abortBuild(buildId) void
    }

    class IndexBuildInfo {
        +BuildId buildId
        +NamespaceString nss
        +vector~BSONObj~ indexSpecs
        +IndexBuildState state
        +Timestamp startTimestamp
        +unique_ptr~IndexBuilder~ builder
    }

    class BuildId {
        +UUID id
        +BuildId()
        +BuildId(uuid)
        +toString() string
        +operator==(other) bool
    }

    IndexBuildsManager --> IndexBuildInfo : manages
    IndexBuildInfo --> IndexBuildState : has
    IndexBuildInfo --> BuildId : identified by
    IndexBuildBlock --> IndexCatalogEntry : locks

4.2 外部排序数据结构

classDiagram
    class SorterOptions {
        +string tempDir
        +size_t maxMemoryUsageBytes
        +bool extSortAllowed
        +size_t dbNameSize
        +size_t sorterFileNameMaxLen
        +SorterOptions()
        +TempDir(path) SorterOptions&
        +MaxMemoryUsageBytes(bytes) SorterOptions&
    }

    class SorterStats {
        +size_t memUsed
        +size_t spilledRanges
        +size_t numSpills
        +size_t totalDataSizeBytes
        +Microseconds timeSpentSpilling
        +SorterStats()
        +addSpill(dataSize, time) void
    }

    class SorterFile {
        -string _path
        -File _file
        -bool _isTemp
        +SorterFile(path, temp)
        +open() void
        +close() void
        +write(data, size) void
        +read(buffer, size) size_t
    }

    class SorterIterator {
        <<template>>
        +more() bool
        +next() pair~Key, Value~
        +openSource() void
        +closeSource() void
    }

    class BSONObjSorterIterator {
        -unique_ptr~SorterFile~ _file
        -BSONObj _current
        +more() bool
        +next() pair~BSONObj, RecordId~
        -_advance() void
    }

    SorterOptions --> SorterStats : configures
    Sorter --> SorterFile : uses
    Sorter ..> SorterIterator : creates
    SorterIterator <|-- BSONObjSorterIterator

5. 索引验证数据结构

5.1 验证结果和选项

classDiagram
    class ValidationOptions {
        +bool full
        +bool background
        +bool metadata
        +bool repair
        +ValidationLevel level
        +ValidationOptions()
        +enforceFastCount() bool
    }

    class ValidationLevel {
        <<enumeration>>
        OFF
        MODERATE
        STRICT
    }

    class ValidateResults {
        +bool valid
        +bool repaired
        +vector~string~ warnings
        +vector~string~ errors
        +vector~string~ extraIndexEntries
        +vector~string~ missingIndexEntries
        +BSONObjBuilder details
        +ValidateResults()
        +addError(msg) void
        +addWarning(msg) void
    }

    class IndexConsistency {
        -map~RecordId, BSONObjSet~ _missingIndexEntries
        -multiset~KeyString::Value~ _extraIndexEntries
        -size_t _numKeys
        -size_t _numRecords
        +addDocKey(key, recordId) void
        +addIndexKey(key, recordId) void
        +repairMissingIndexEntries(opCtx) int
        +repairExtraIndexEntries(opCtx) int
    }

    ValidationOptions --> ValidationLevel : uses
    ValidateResults --> ValidationOptions : based on
    IndexConsistency --> ValidateResults : produces

6. 字段映射和约束

6.1 字段到索引的映射关系

数据结构 主要字段 数据类型 约束条件 说明
IndexDescriptor _keyPattern BSONObj 非空,最大32字段 索引键模式定义
IndexDescriptor _indexName string 非空,最大127字符 索引唯一名称
IndexCatalogEntry _isReady bool - 索引是否就绪
KeyString::Value _buffer SharedBuffer 最大1024字节 索引键的二进制表示
RecordId _id int64_t > 0 记录标识符
MultikeyPaths _paths vector<set<size_t» 最大32个路径 多键路径集合