db.go 28 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175
  1. // Copyright (c) 2012, Suryandaru Triandana <syndtr@gmail.com>
  2. // All rights reserved.
  3. //
  4. // Use of this source code is governed by a BSD-style license that can be
  5. // found in the LICENSE file.
  6. package leveldb
  7. import (
  8. "container/list"
  9. "fmt"
  10. "io"
  11. "os"
  12. "runtime"
  13. "strings"
  14. "sync"
  15. "sync/atomic"
  16. "time"
  17. "github.com/syndtr/goleveldb/leveldb/errors"
  18. "github.com/syndtr/goleveldb/leveldb/iterator"
  19. "github.com/syndtr/goleveldb/leveldb/journal"
  20. "github.com/syndtr/goleveldb/leveldb/memdb"
  21. "github.com/syndtr/goleveldb/leveldb/opt"
  22. "github.com/syndtr/goleveldb/leveldb/storage"
  23. "github.com/syndtr/goleveldb/leveldb/table"
  24. "github.com/syndtr/goleveldb/leveldb/util"
  25. )
// DB is a LevelDB database.
type DB struct {
	// Need 64-bit alignment.
	// seq is the current sequence number (read during journal recovery
	// and property reporting; updated as batches are applied).
	seq uint64

	// Stats. Need 64-bit alignment.
	cWriteDelay            int64 // The cumulative duration of write delays
	cWriteDelayN           int32 // The cumulative number of write delays
	inWritePaused          int32 // The indicator whether write operation is paused by compaction
	aliveSnaps, aliveIters int32 // Live snapshot/iterator counters, accessed atomically.

	// Session.
	s *session

	// MemDB.
	memMu           sync.RWMutex
	memPool         chan *memdb.DB // Single-slot pool of reusable memdbs (drained by mpoolDrain).
	mem, frozenMem  *memDB         // Effective and frozen (being-flushed) memdbs.
	journal         *journal.Writer
	journalWriter   storage.Writer
	journalFd       storage.FileDesc
	frozenJournalFd storage.FileDesc
	frozenSeq       uint64

	// Snapshot.
	snapsMu   sync.Mutex
	snapsList *list.List

	// Write.
	batchPool    sync.Pool
	writeMergeC  chan writeMerge
	writeMergedC chan bool
	writeLockC   chan struct{} // Capacity 1; acts as the write mutex.
	writeAckC    chan error
	writeDelay   time.Duration
	writeDelayN  int
	tr           *Transaction

	// Compaction.
	compCommitLk     sync.Mutex
	tcompCmdC        chan cCmd
	tcompPauseC      chan chan<- struct{}
	mcompCmdC        chan cCmd
	compErrC         chan error
	compPerErrC      chan error
	compErrSetC      chan error
	compWriteLocking bool
	compStats        cStats
	memdbMaxLevel    int // For testing.

	// Close.
	closeW sync.WaitGroup
	closeC chan struct{}
	closed uint32 // Set non-zero once the DB is closed.
	closer io.Closer
}
// openDB finishes opening a database on top of an already recovered
// session: it replays journal files, removes obsolete files (read-write
// mode only) and starts the background compaction goroutines.
func openDB(s *session) (*DB, error) {
	s.log("db@open opening")
	start := time.Now()
	db := &DB{
		s: s,
		// Initial sequence
		seq: s.stSeqNum,
		// MemDB
		memPool: make(chan *memdb.DB, 1),
		// Snapshot
		snapsList: list.New(),
		// Write
		batchPool:    sync.Pool{New: newBatch},
		writeMergeC:  make(chan writeMerge),
		writeMergedC: make(chan bool),
		writeLockC:   make(chan struct{}, 1),
		writeAckC:    make(chan error),
		// Compaction
		tcompCmdC:   make(chan cCmd),
		tcompPauseC: make(chan chan<- struct{}),
		mcompCmdC:   make(chan cCmd),
		compErrC:    make(chan error),
		compPerErrC: make(chan error),
		compErrSetC: make(chan error),
		// Close
		closeC: make(chan struct{}),
	}

	// Read-only mode.
	readOnly := s.o.GetReadOnly()

	if readOnly {
		// Recover journals (read-only mode).
		if err := db.recoverJournalRO(); err != nil {
			return nil, err
		}
	} else {
		// Recover journals.
		if err := db.recoverJournal(); err != nil {
			return nil, err
		}

		// Remove any obsolete files.
		if err := db.checkAndCleanFiles(); err != nil {
			// Close journal before bailing out.
			if db.journal != nil {
				db.journal.Close()
				db.journalWriter.Close()
			}
			return nil, err
		}
	}

	// Doesn't need to be included in the wait group.
	go db.compactionError()
	go db.mpoolDrain()

	if readOnly {
		db.SetReadOnly()
	} else {
		// Two goroutines are tracked by closeW so Close can wait for them.
		db.closeW.Add(2)
		go db.tCompaction()
		go db.mCompaction()
		// go db.jWriter()
	}

	s.logf("db@open done T·%v", time.Since(start))

	// Reclaim resources even if the caller forgets to call Close.
	runtime.SetFinalizer(db, (*DB).Close)
	return db, nil
}
  139. // Open opens or creates a DB for the given storage.
  140. // The DB will be created if not exist, unless ErrorIfMissing is true.
  141. // Also, if ErrorIfExist is true and the DB exist Open will returns
  142. // os.ErrExist error.
  143. //
  144. // Open will return an error with type of ErrCorrupted if corruption
  145. // detected in the DB. Use errors.IsCorrupted to test whether an error is
  146. // due to corruption. Corrupted DB can be recovered with Recover function.
  147. //
  148. // The returned DB instance is safe for concurrent use.
  149. // The DB must be closed after use, by calling Close method.
  150. func Open(stor storage.Storage, o *opt.Options) (db *DB, err error) {
  151. s, err := newSession(stor, o)
  152. if err != nil {
  153. return
  154. }
  155. defer func() {
  156. if err != nil {
  157. s.close()
  158. s.release()
  159. }
  160. }()
  161. err = s.recover()
  162. if err != nil {
  163. if !os.IsNotExist(err) || s.o.GetErrorIfMissing() {
  164. return
  165. }
  166. err = s.create()
  167. if err != nil {
  168. return
  169. }
  170. } else if s.o.GetErrorIfExist() {
  171. err = os.ErrExist
  172. return
  173. }
  174. return openDB(s)
  175. }
  176. // OpenFile opens or creates a DB for the given path.
  177. // The DB will be created if not exist, unless ErrorIfMissing is true.
  178. // Also, if ErrorIfExist is true and the DB exist OpenFile will returns
  179. // os.ErrExist error.
  180. //
  181. // OpenFile uses standard file-system backed storage implementation as
  182. // described in the leveldb/storage package.
  183. //
  184. // OpenFile will return an error with type of ErrCorrupted if corruption
  185. // detected in the DB. Use errors.IsCorrupted to test whether an error is
  186. // due to corruption. Corrupted DB can be recovered with Recover function.
  187. //
  188. // The returned DB instance is safe for concurrent use.
  189. // The DB must be closed after use, by calling Close method.
  190. func OpenFile(path string, o *opt.Options) (db *DB, err error) {
  191. stor, err := storage.OpenFile(path, o.GetReadOnly())
  192. if err != nil {
  193. return
  194. }
  195. db, err = Open(stor, o)
  196. if err != nil {
  197. stor.Close()
  198. } else {
  199. db.closer = stor
  200. }
  201. return
  202. }
  203. // Recover recovers and opens a DB with missing or corrupted manifest files
  204. // for the given storage. It will ignore any manifest files, valid or not.
  205. // The DB must already exist or it will returns an error.
  206. // Also, Recover will ignore ErrorIfMissing and ErrorIfExist options.
  207. //
  208. // The returned DB instance is safe for concurrent use.
  209. // The DB must be closed after use, by calling Close method.
  210. func Recover(stor storage.Storage, o *opt.Options) (db *DB, err error) {
  211. s, err := newSession(stor, o)
  212. if err != nil {
  213. return
  214. }
  215. defer func() {
  216. if err != nil {
  217. s.close()
  218. s.release()
  219. }
  220. }()
  221. err = recoverTable(s, o)
  222. if err != nil {
  223. return
  224. }
  225. return openDB(s)
  226. }
  227. // RecoverFile recovers and opens a DB with missing or corrupted manifest files
  228. // for the given path. It will ignore any manifest files, valid or not.
  229. // The DB must already exist or it will returns an error.
  230. // Also, Recover will ignore ErrorIfMissing and ErrorIfExist options.
  231. //
  232. // RecoverFile uses standard file-system backed storage implementation as described
  233. // in the leveldb/storage package.
  234. //
  235. // The returned DB instance is safe for concurrent use.
  236. // The DB must be closed after use, by calling Close method.
  237. func RecoverFile(path string, o *opt.Options) (db *DB, err error) {
  238. stor, err := storage.OpenFile(path, false)
  239. if err != nil {
  240. return
  241. }
  242. db, err = Recover(stor, o)
  243. if err != nil {
  244. stor.Close()
  245. } else {
  246. db.closer = stor
  247. }
  248. return
  249. }
// recoverTable rebuilds a fresh manifest from the table files alone,
// ignoring any existing manifest. Every readable table is registered at
// level 0; corrupted tables are rebuilt from their readable entries, or
// dropped entirely when StrictRecovery is set.
func recoverTable(s *session, o *opt.Options) error {
	o = dupOptions(o)
	// Mask StrictReader, lets StrictRecovery doing its job.
	o.Strict &= ^opt.StrictReader

	// Get all tables and sort it by file number.
	fds, err := s.stor.List(storage.TypeTable)
	if err != nil {
		return err
	}
	sortFds(fds)

	var (
		maxSeq                                                            uint64
		recoveredKey, goodKey, corruptedKey, corruptedBlock, droppedTable int

		// We will drop corrupted table.
		strict = o.GetStrict(opt.StrictRecovery)
		noSync = o.GetNoSync()

		rec   = &sessionRecord{}
		bpool = util.NewBufferPool(o.GetBlockSize() + 5)
	)
	// buildTable copies iter's entries into a fresh temporary table
	// file, skipping malformed internal keys. On failure the temp file
	// is removed and a zero FileDesc is returned.
	buildTable := func(iter iterator.Iterator) (tmpFd storage.FileDesc, size int64, err error) {
		tmpFd = s.newTemp()
		writer, err := s.stor.Create(tmpFd)
		if err != nil {
			return
		}
		defer func() {
			writer.Close()
			if err != nil {
				s.stor.Remove(tmpFd)
				tmpFd = storage.FileDesc{}
			}
		}()

		// Copy entries.
		tw := table.NewWriter(writer, o)
		for iter.Next() {
			key := iter.Key()
			if validInternalKey(key) {
				err = tw.Append(key, iter.Value())
				if err != nil {
					return
				}
			}
		}
		// Corruption errors are tolerated here: we keep whatever was
		// readable before the corruption point.
		err = iter.Error()
		if err != nil && !errors.IsCorrupted(err) {
			return
		}
		err = tw.Close()
		if err != nil {
			return
		}
		if !noSync {
			err = writer.Sync()
			if err != nil {
				return
			}
		}
		size = int64(tw.BytesLen())
		return
	}
	// recoverTable scans one table file, counting good/corrupted keys,
	// rebuilding the file when it is salvageable, and registering the
	// result at level 0 via rec.
	recoverTable := func(fd storage.FileDesc) error {
		s.logf("table@recovery recovering @%d", fd.Num)
		reader, err := s.stor.Open(fd)
		if err != nil {
			return err
		}
		// closed tracks whether reader was closed early (rebuild path),
		// so the deferred close does not double-close it.
		var closed bool
		defer func() {
			if !closed {
				reader.Close()
			}
		}()

		// Get file size.
		size, err := reader.Seek(0, 2) // whence 2 == io.SeekEnd
		if err != nil {
			return err
		}

		var (
			tSeq                                     uint64
			tgoodKey, tcorruptedKey, tcorruptedBlock int
			imin, imax                               []byte
		)
		tr, err := table.NewReader(reader, size, fd, nil, bpool, o)
		if err != nil {
			return err
		}
		iter := tr.NewIterator(nil, nil)
		if itererr, ok := iter.(iterator.ErrorCallbackSetter); ok {
			// Count block corruptions instead of aborting the scan.
			itererr.SetErrorCallback(func(err error) {
				if errors.IsCorrupted(err) {
					s.logf("table@recovery block corruption @%d %q", fd.Num, err)
					tcorruptedBlock++
				}
			})
		}

		// Scan the table.
		for iter.Next() {
			key := iter.Key()
			_, seq, _, kerr := parseInternalKey(key)
			if kerr != nil {
				tcorruptedKey++
				continue
			}
			tgoodKey++
			if seq > tSeq {
				tSeq = seq
			}
			// Track the internal-key range covered by this table.
			if imin == nil {
				imin = append([]byte{}, key...)
			}
			imax = append(imax[:0], key...)
		}
		if err := iter.Error(); err != nil && !errors.IsCorrupted(err) {
			iter.Release()
			return err
		}
		iter.Release()

		goodKey += tgoodKey
		corruptedKey += tcorruptedKey
		corruptedBlock += tcorruptedBlock

		// Strict recovery: any corruption drops the whole table.
		if strict && (tcorruptedKey > 0 || tcorruptedBlock > 0) {
			droppedTable++
			s.logf("table@recovery dropped @%d Gk·%d Ck·%d Cb·%d S·%d Q·%d", fd.Num, tgoodKey, tcorruptedKey, tcorruptedBlock, size, tSeq)
			return nil
		}

		if tgoodKey > 0 {
			if tcorruptedKey > 0 || tcorruptedBlock > 0 {
				// Rebuild the table.
				s.logf("table@recovery rebuilding @%d", fd.Num)
				iter := tr.NewIterator(nil, nil)
				tmpFd, newSize, err := buildTable(iter)
				iter.Release()
				if err != nil {
					return err
				}
				closed = true
				reader.Close()
				// Replace the corrupted file with the rebuilt one.
				if err := s.stor.Rename(tmpFd, fd); err != nil {
					return err
				}
				size = newSize
			}
			if tSeq > maxSeq {
				maxSeq = tSeq
			}
			recoveredKey += tgoodKey
			// Add table to level 0.
			rec.addTable(0, fd.Num, size, imin, imax)
			s.logf("table@recovery recovered @%d Gk·%d Ck·%d Cb·%d S·%d Q·%d", fd.Num, tgoodKey, tcorruptedKey, tcorruptedBlock, size, tSeq)
		} else {
			droppedTable++
			s.logf("table@recovery unrecoverable @%d Ck·%d Cb·%d S·%d", fd.Num, tcorruptedKey, tcorruptedBlock, size)
		}

		return nil
	}

	// Recover all tables.
	if len(fds) > 0 {
		s.logf("table@recovery F·%d", len(fds))

		// Mark file number as used.
		s.markFileNum(fds[len(fds)-1].Num)

		for _, fd := range fds {
			if err := recoverTable(fd); err != nil {
				return err
			}
		}

		s.logf("table@recovery recovered F·%d N·%d Gk·%d Ck·%d Q·%d", len(fds), recoveredKey, goodKey, corruptedKey, maxSeq)
	}

	// Set sequence number.
	rec.setSeqNum(maxSeq)

	// Create new manifest.
	if err := s.create(); err != nil {
		return err
	}

	// Commit.
	return s.commit(rec)
}
// recoverJournal replays every journal file at or after the session's
// current journal number into a memdb, flushing the memdb to level-0
// tables when it fills (or when moving to the next journal), then
// commits the recovered state and starts a fresh journal.
func (db *DB) recoverJournal() error {
	// Get all journals and sort it by file number.
	rawFds, err := db.s.stor.List(storage.TypeJournal)
	if err != nil {
		return err
	}
	sortFds(rawFds)

	// Journals that will be recovered.
	var fds []storage.FileDesc
	for _, fd := range rawFds {
		if fd.Num >= db.s.stJournalNum || fd.Num == db.s.stPrevJournalNum {
			fds = append(fds, fd)
		}
	}

	var (
		ofd storage.FileDesc // Obsolete file.
		rec = &sessionRecord{}
	)

	// Recover journals.
	if len(fds) > 0 {
		db.logf("journal@recovery F·%d", len(fds))

		// Mark file number as used.
		db.s.markFileNum(fds[len(fds)-1].Num)

		var (
			// Options.
			strict      = db.s.o.GetStrict(opt.StrictJournal)
			checksum    = db.s.o.GetStrict(opt.StrictJournalChecksum)
			writeBuffer = db.s.o.GetWriteBuffer()

			jr       *journal.Reader
			mdb      = memdb.New(db.s.icmp, writeBuffer)
			buf      = &util.Buffer{}
			batchSeq uint64
			batchLen int
		)

		for _, fd := range fds {
			db.logf("journal@recovery recovering @%d", fd.Num)

			fr, err := db.s.stor.Open(fd)
			if err != nil {
				return err
			}

			// Create or reset journal reader instance.
			if jr == nil {
				jr = journal.NewReader(fr, dropper{db.s, fd}, strict, checksum)
			} else {
				jr.Reset(fr, dropper{db.s, fd}, strict, checksum)
			}

			// Flush memdb and remove obsolete journal file. The commit
			// happens before removing ofd so a crash here never loses
			// data that only existed in the previous journal.
			if !ofd.Zero() {
				if mdb.Len() > 0 {
					if _, err := db.s.flushMemdb(rec, mdb, 0); err != nil {
						fr.Close()
						return err
					}
				}

				rec.setJournalNum(fd.Num)
				rec.setSeqNum(db.seq)
				if err := db.s.commit(rec); err != nil {
					fr.Close()
					return err
				}
				rec.resetAddedTables()

				db.s.stor.Remove(ofd)
				ofd = storage.FileDesc{}
			}

			// Replay journal to memdb.
			mdb.Reset()
			for {
				r, err := jr.Next()
				if err != nil {
					if err == io.EOF {
						break
					}

					fr.Close()
					return errors.SetFd(err, fd)
				}

				buf.Reset()
				if _, err := buf.ReadFrom(r); err != nil {
					if err == io.ErrUnexpectedEOF {
						// This is error returned due to corruption, with strict == false.
						continue
					}

					fr.Close()
					return errors.SetFd(err, fd)
				}
				batchSeq, batchLen, err = decodeBatchToMem(buf.Bytes(), db.seq, mdb)
				if err != nil {
					if !strict && errors.IsCorrupted(err) {
						db.s.logf("journal error: %v (skipped)", err)
						// We won't apply sequence number as it might be corrupted.
						continue
					}

					fr.Close()
					return errors.SetFd(err, fd)
				}

				// Save sequence number.
				db.seq = batchSeq + uint64(batchLen)

				// Flush it if large enough.
				if mdb.Size() >= writeBuffer {
					if _, err := db.s.flushMemdb(rec, mdb, 0); err != nil {
						fr.Close()
						return err
					}

					mdb.Reset()
				}
			}

			fr.Close()
			ofd = fd
		}

		// Flush the last memdb.
		if mdb.Len() > 0 {
			if _, err := db.s.flushMemdb(rec, mdb, 0); err != nil {
				return err
			}
		}
	}

	// Create a new journal.
	if _, err := db.newMem(0); err != nil {
		return err
	}

	// Commit.
	rec.setJournalNum(db.journalFd.Num)
	rec.setSeqNum(db.seq)
	if err := db.s.commit(rec); err != nil {
		// Close journal on error.
		if db.journal != nil {
			db.journal.Close()
			db.journalWriter.Close()
		}
		return err
	}

	// Remove the last obsolete journal file.
	if !ofd.Zero() {
		db.s.stor.Remove(ofd)
	}

	return nil
}
// recoverJournalRO replays the relevant journal files into a single
// in-memory memdb without writing anything back to storage (read-only
// mode): no flushes, no commits, no journal-file removal.
func (db *DB) recoverJournalRO() error {
	// Get all journals and sort it by file number.
	rawFds, err := db.s.stor.List(storage.TypeJournal)
	if err != nil {
		return err
	}
	sortFds(rawFds)

	// Journals that will be recovered.
	var fds []storage.FileDesc
	for _, fd := range rawFds {
		if fd.Num >= db.s.stJournalNum || fd.Num == db.s.stPrevJournalNum {
			fds = append(fds, fd)
		}
	}

	var (
		// Options.
		strict      = db.s.o.GetStrict(opt.StrictJournal)
		checksum    = db.s.o.GetStrict(opt.StrictJournalChecksum)
		writeBuffer = db.s.o.GetWriteBuffer()

		mdb = memdb.New(db.s.icmp, writeBuffer)
	)

	// Recover journals.
	if len(fds) > 0 {
		db.logf("journal@recovery RO·Mode F·%d", len(fds))

		var (
			jr       *journal.Reader
			buf      = &util.Buffer{}
			batchSeq uint64
			batchLen int
		)

		for _, fd := range fds {
			db.logf("journal@recovery recovering @%d", fd.Num)

			fr, err := db.s.stor.Open(fd)
			if err != nil {
				return err
			}

			// Create or reset journal reader instance.
			if jr == nil {
				jr = journal.NewReader(fr, dropper{db.s, fd}, strict, checksum)
			} else {
				jr.Reset(fr, dropper{db.s, fd}, strict, checksum)
			}

			// Replay journal to memdb.
			for {
				r, err := jr.Next()
				if err != nil {
					if err == io.EOF {
						break
					}

					fr.Close()
					return errors.SetFd(err, fd)
				}

				buf.Reset()
				if _, err := buf.ReadFrom(r); err != nil {
					if err == io.ErrUnexpectedEOF {
						// This is error returned due to corruption, with strict == false.
						continue
					}

					fr.Close()
					return errors.SetFd(err, fd)
				}
				batchSeq, batchLen, err = decodeBatchToMem(buf.Bytes(), db.seq, mdb)
				if err != nil {
					if !strict && errors.IsCorrupted(err) {
						db.s.logf("journal error: %v (skipped)", err)
						// We won't apply sequence number as it might be corrupted.
						continue
					}

					fr.Close()
					return errors.SetFd(err, fd)
				}

				// Save sequence number.
				db.seq = batchSeq + uint64(batchLen)
			}

			fr.Close()
		}
	}

	// Set memDB: the replayed state becomes the effective memdb.
	db.mem = &memDB{db: db, DB: mdb, ref: 1}

	return nil
}
  643. func memGet(mdb *memdb.DB, ikey internalKey, icmp *iComparer) (ok bool, mv []byte, err error) {
  644. mk, mv, err := mdb.Find(ikey)
  645. if err == nil {
  646. ukey, _, kt, kerr := parseInternalKey(mk)
  647. if kerr != nil {
  648. // Shouldn't have had happen.
  649. panic(kerr)
  650. }
  651. if icmp.uCompare(ukey, ikey.ukey()) == 0 {
  652. if kt == keyTypeDel {
  653. return true, nil, ErrNotFound
  654. }
  655. return true, mv, nil
  656. }
  657. } else if err != ErrNotFound {
  658. return true, nil, err
  659. }
  660. return
  661. }
// get looks up key at the given sequence number, consulting in order:
// the optional auxiliary memdb and tables (auxm/auxt), the effective
// and frozen memdbs, and finally the current version's tables. The
// returned value is always a fresh copy.
func (db *DB) get(auxm *memdb.DB, auxt tFiles, key []byte, seq uint64, ro *opt.ReadOptions) (value []byte, err error) {
	ikey := makeInternalKey(nil, key, seq, keyTypeSeek)

	if auxm != nil {
		if ok, mv, me := memGet(auxm, ikey, db.s.icmp); ok {
			// Copy so the caller never aliases memdb-owned bytes.
			return append([]byte{}, mv...), me
		}
	}

	em, fm := db.getMems()
	for _, m := range [...]*memDB{em, fm} {
		if m == nil {
			continue
		}
		// Defers run at function return, not per iteration, so each
		// memdb ref is held until the value has been copied out.
		defer m.decref()

		if ok, mv, me := memGet(m.DB, ikey, db.s.icmp); ok {
			return append([]byte{}, mv...), me
		}
	}

	v := db.s.version()
	value, cSched, err := v.get(auxt, ikey, ro, false)
	v.release()
	if cSched {
		// Trigger table compaction.
		db.compTrigger(db.tcompCmdC)
	}
	return
}
  688. func nilIfNotFound(err error) error {
  689. if err == ErrNotFound {
  690. return nil
  691. }
  692. return err
  693. }
// has reports whether key exists (and is not shadowed by a deletion)
// at the given sequence number, using the same lookup order as get but
// without materializing the value.
func (db *DB) has(auxm *memdb.DB, auxt tFiles, key []byte, seq uint64, ro *opt.ReadOptions) (ret bool, err error) {
	ikey := makeInternalKey(nil, key, seq, keyTypeSeek)

	if auxm != nil {
		if ok, _, me := memGet(auxm, ikey, db.s.icmp); ok {
			return me == nil, nilIfNotFound(me)
		}
	}

	em, fm := db.getMems()
	for _, m := range [...]*memDB{em, fm} {
		if m == nil {
			continue
		}
		// Defers run at function return, not per iteration; only the
		// two memdbs are involved.
		defer m.decref()

		if ok, _, me := memGet(m.DB, ikey, db.s.icmp); ok {
			return me == nil, nilIfNotFound(me)
		}
	}

	v := db.s.version()
	_, cSched, err := v.get(auxt, ikey, ro, true)
	v.release()
	if cSched {
		// Trigger table compaction.
		db.compTrigger(db.tcompCmdC)
	}
	// ErrNotFound simply means "absent", not a failure.
	if err == nil {
		ret = true
	} else if err == ErrNotFound {
		err = nil
	}
	return
}
  725. // Get gets the value for the given key. It returns ErrNotFound if the
  726. // DB does not contains the key.
  727. //
  728. // The returned slice is its own copy, it is safe to modify the contents
  729. // of the returned slice.
  730. // It is safe to modify the contents of the argument after Get returns.
  731. func (db *DB) Get(key []byte, ro *opt.ReadOptions) (value []byte, err error) {
  732. err = db.ok()
  733. if err != nil {
  734. return
  735. }
  736. se := db.acquireSnapshot()
  737. defer db.releaseSnapshot(se)
  738. return db.get(nil, nil, key, se.seq, ro)
  739. }
  740. // Has returns true if the DB does contains the given key.
  741. //
  742. // It is safe to modify the contents of the argument after Has returns.
  743. func (db *DB) Has(key []byte, ro *opt.ReadOptions) (ret bool, err error) {
  744. err = db.ok()
  745. if err != nil {
  746. return
  747. }
  748. se := db.acquireSnapshot()
  749. defer db.releaseSnapshot(se)
  750. return db.has(nil, nil, key, se.seq, ro)
  751. }
  752. // NewIterator returns an iterator for the latest snapshot of the
  753. // underlying DB.
  754. // The returned iterator is not safe for concurrent use, but it is safe to use
  755. // multiple iterators concurrently, with each in a dedicated goroutine.
  756. // It is also safe to use an iterator concurrently with modifying its
  757. // underlying DB. The resultant key/value pairs are guaranteed to be
  758. // consistent.
  759. //
  760. // Slice allows slicing the iterator to only contains keys in the given
  761. // range. A nil Range.Start is treated as a key before all keys in the
  762. // DB. And a nil Range.Limit is treated as a key after all keys in
  763. // the DB.
  764. //
  765. // The iterator must be released after use, by calling Release method.
  766. //
  767. // Also read Iterator documentation of the leveldb/iterator package.
  768. func (db *DB) NewIterator(slice *util.Range, ro *opt.ReadOptions) iterator.Iterator {
  769. if err := db.ok(); err != nil {
  770. return iterator.NewEmptyIterator(err)
  771. }
  772. se := db.acquireSnapshot()
  773. defer db.releaseSnapshot(se)
  774. // Iterator holds 'version' lock, 'version' is immutable so snapshot
  775. // can be released after iterator created.
  776. return db.newIterator(nil, nil, se.seq, slice, ro)
  777. }
  778. // GetSnapshot returns a latest snapshot of the underlying DB. A snapshot
  779. // is a frozen snapshot of a DB state at a particular point in time. The
  780. // content of snapshot are guaranteed to be consistent.
  781. //
  782. // The snapshot must be released after use, by calling Release method.
  783. func (db *DB) GetSnapshot() (*Snapshot, error) {
  784. if err := db.ok(); err != nil {
  785. return nil, err
  786. }
  787. return db.newSnapshot(), nil
  788. }
// GetProperty returns value of the given property name. It returns
// ErrNotFound for unknown property names.
//
// Property names:
//	leveldb.num-files-at-level{n}
//		Returns the number of files at level 'n'.
//	leveldb.stats
//		Returns statistics of the underlying DB.
//	leveldb.iostats
//		Returns statistics of effective disk read and write.
//	leveldb.writedelay
//		Returns cumulative write delay caused by compaction.
//	leveldb.sstables
//		Returns sstables list for each level.
//	leveldb.blockpool
//		Returns block pool stats.
//	leveldb.cachedblock
//		Returns size of cached block.
//	leveldb.openedtables
//		Returns number of opened tables.
//	leveldb.alivesnaps
//		Returns number of alive snapshots.
//	leveldb.aliveiters
//		Returns number of alive iterators.
func (db *DB) GetProperty(name string) (value string, err error) {
	err = db.ok()
	if err != nil {
		return
	}

	const prefix = "leveldb."
	if !strings.HasPrefix(name, prefix) {
		return "", ErrNotFound
	}
	p := name[len(prefix):]

	v := db.s.version()
	defer v.release()

	numFilesPrefix := "num-files-at-level"
	switch {
	case strings.HasPrefix(p, numFilesPrefix):
		var level uint
		var rest string
		// Exactly one item (the level) must scan; any trailing text
		// makes n != 1 and the property name is rejected.
		n, _ := fmt.Sscanf(p[len(numFilesPrefix):], "%d%s", &level, &rest)
		if n != 1 {
			err = ErrNotFound
		} else {
			value = fmt.Sprint(v.tLen(int(level)))
		}
	case p == "stats":
		value = "Compactions\n" +
			" Level | Tables | Size(MB) | Time(sec) | Read(MB) | Write(MB)\n" +
			"-------+------------+---------------+---------------+---------------+---------------\n"
		for level, tables := range v.levels {
			duration, read, write := db.compStats.getStat(level)
			// Skip levels with no tables and no recorded compaction time.
			if len(tables) == 0 && duration == 0 {
				continue
			}
			value += fmt.Sprintf(" %3d | %10d | %13.5f | %13.5f | %13.5f | %13.5f\n",
				level, len(tables), float64(tables.size())/1048576.0, duration.Seconds(),
				float64(read)/1048576.0, float64(write)/1048576.0)
		}
	case p == "iostats":
		value = fmt.Sprintf("Read(MB):%.5f Write(MB):%.5f",
			float64(db.s.stor.reads())/1048576.0,
			float64(db.s.stor.writes())/1048576.0)
	case p == "writedelay":
		writeDelayN, writeDelay := atomic.LoadInt32(&db.cWriteDelayN), time.Duration(atomic.LoadInt64(&db.cWriteDelay))
		paused := atomic.LoadInt32(&db.inWritePaused) == 1
		value = fmt.Sprintf("DelayN:%d Delay:%s Paused:%t", writeDelayN, writeDelay, paused)
	case p == "sstables":
		for level, tables := range v.levels {
			value += fmt.Sprintf("--- level %d ---\n", level)
			for _, t := range tables {
				value += fmt.Sprintf("%d:%d[%q .. %q]\n", t.fd.Num, t.size, t.imin, t.imax)
			}
		}
	case p == "blockpool":
		value = fmt.Sprintf("%v", db.s.tops.bpool)
	case p == "cachedblock":
		if db.s.tops.bcache != nil {
			value = fmt.Sprintf("%d", db.s.tops.bcache.Size())
		} else {
			value = "<nil>"
		}
	case p == "openedtables":
		value = fmt.Sprintf("%d", db.s.tops.cache.Size())
	case p == "alivesnaps":
		value = fmt.Sprintf("%d", atomic.LoadInt32(&db.aliveSnaps))
	case p == "aliveiters":
		value = fmt.Sprintf("%d", atomic.LoadInt32(&db.aliveIters))
	default:
		err = ErrNotFound
	}

	return
}
// DBStats is database statistics, populated by DB.Stats.
type DBStats struct {
	// Cumulative write-delay counters.
	WriteDelayCount    int32
	WriteDelayDuration time.Duration
	WritePaused        bool // Whether writes are currently paused by compaction.

	AliveSnapshots int32
	AliveIterators int32

	// Effective storage I/O, in bytes.
	IOWrite uint64
	IORead  uint64

	BlockCacheSize    int
	OpenedTablesCount int

	// Per-level statistics. Levels with no tables and no recorded
	// compaction time are skipped, so these slices may be shorter than
	// the total number of levels.
	LevelSizes        []int64
	LevelTablesCounts []int
	LevelRead         []int64
	LevelWrite        []int64
	LevelDurations    []time.Duration
}
  899. // Stats populates s with database statistics.
  900. func (db *DB) Stats(s *DBStats) error {
  901. err := db.ok()
  902. if err != nil {
  903. return err
  904. }
  905. s.IORead = db.s.stor.reads()
  906. s.IOWrite = db.s.stor.writes()
  907. s.WriteDelayCount = atomic.LoadInt32(&db.cWriteDelayN)
  908. s.WriteDelayDuration = time.Duration(atomic.LoadInt64(&db.cWriteDelay))
  909. s.WritePaused = atomic.LoadInt32(&db.inWritePaused) == 1
  910. s.OpenedTablesCount = db.s.tops.cache.Size()
  911. if db.s.tops.bcache != nil {
  912. s.BlockCacheSize = db.s.tops.bcache.Size()
  913. } else {
  914. s.BlockCacheSize = 0
  915. }
  916. s.AliveIterators = atomic.LoadInt32(&db.aliveIters)
  917. s.AliveSnapshots = atomic.LoadInt32(&db.aliveSnaps)
  918. s.LevelDurations = s.LevelDurations[:0]
  919. s.LevelRead = s.LevelRead[:0]
  920. s.LevelWrite = s.LevelWrite[:0]
  921. s.LevelSizes = s.LevelSizes[:0]
  922. s.LevelTablesCounts = s.LevelTablesCounts[:0]
  923. v := db.s.version()
  924. defer v.release()
  925. for level, tables := range v.levels {
  926. duration, read, write := db.compStats.getStat(level)
  927. if len(tables) == 0 && duration == 0 {
  928. continue
  929. }
  930. s.LevelDurations = append(s.LevelDurations, duration)
  931. s.LevelRead = append(s.LevelRead, read)
  932. s.LevelWrite = append(s.LevelWrite, write)
  933. s.LevelSizes = append(s.LevelSizes, tables.size())
  934. s.LevelTablesCounts = append(s.LevelTablesCounts, len(tables))
  935. }
  936. return nil
  937. }
  938. // SizeOf calculates approximate sizes of the given key ranges.
  939. // The length of the returned sizes are equal with the length of the given
  940. // ranges. The returned sizes measure storage space usage, so if the user
  941. // data compresses by a factor of ten, the returned sizes will be one-tenth
  942. // the size of the corresponding user data size.
  943. // The results may not include the sizes of recently written data.
  944. func (db *DB) SizeOf(ranges []util.Range) (Sizes, error) {
  945. if err := db.ok(); err != nil {
  946. return nil, err
  947. }
  948. v := db.s.version()
  949. defer v.release()
  950. sizes := make(Sizes, 0, len(ranges))
  951. for _, r := range ranges {
  952. imin := makeInternalKey(nil, r.Start, keyMaxSeq, keyTypeSeek)
  953. imax := makeInternalKey(nil, r.Limit, keyMaxSeq, keyTypeSeek)
  954. start, err := v.offsetOf(imin)
  955. if err != nil {
  956. return nil, err
  957. }
  958. limit, err := v.offsetOf(imax)
  959. if err != nil {
  960. return nil, err
  961. }
  962. var size int64
  963. if limit >= start {
  964. size = limit - start
  965. }
  966. sizes = append(sizes, size)
  967. }
  968. return sizes, nil
  969. }
// Close closes the DB. This will also release any outstanding snapshot,
// abort any in-flight compaction and discard open transaction.
//
// It is not safe to close a DB until all outstanding iterators are released.
// It is valid to call Close multiple times. Other methods should not be
// called after the DB has been closed.
func (db *DB) Close() error {
	// Atomically flip the closed flag; only the first Close proceeds,
	// later calls return ErrClosed immediately.
	if !db.setClosed() {
		return ErrClosed
	}

	start := time.Now()
	db.log("db@close closing")

	// Clear the finalizer.
	runtime.SetFinalizer(db, nil)

	// Get compaction error, if any (non-blocking receive).
	var err error
	select {
	case err = <-db.compErrC:
		// ErrReadOnly is expected for read-only DBs and is not treated
		// as a close failure.
		if err == ErrReadOnly {
			err = nil
		}
	default:
	}

	// Signal all goroutines.
	close(db.closeC)

	// Discard open transaction.
	if db.tr != nil {
		db.tr.Discard()
	}

	// Acquire writer lock; blocks until any in-flight write finishes.
	db.writeLockC <- struct{}{}

	// Wait for all goroutines to exit.
	db.closeW.Wait()

	// Closes journal.
	if db.journal != nil {
		db.journal.Close()
		db.journalWriter.Close()
		db.journal = nil
		db.journalWriter = nil
	}

	if db.writeDelayN > 0 {
		db.logf("db@write was delayed N·%d T·%v", db.writeDelayN, db.writeDelay)
	}

	// Close session.
	db.s.close()
	db.logf("db@close done T·%v", time.Since(start))
	db.s.release()

	if db.closer != nil {
		// The compaction error takes priority; the closer's error is
		// surfaced only when no earlier error occurred.
		if err1 := db.closer.Close(); err == nil {
			err = err1
		}
		db.closer = nil
	}

	// Clear memdbs.
	db.clearMems()

	return err
}