forked from mysql/mysql-server
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathbuf0buf.cc
7023 lines (5668 loc) · 188 KB
/
buf0buf.cc
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
/*****************************************************************************
Copyright (c) 1995, 2023, Oracle and/or its affiliates.
Copyright (c) 2008, Google Inc.
Portions of this file contain modifications contributed and copyrighted by
Google, Inc. Those modifications are gratefully acknowledged and are described
briefly in the InnoDB documentation. The contributions by Google are
incorporated with their permission, and subject to the conditions contained in
the file COPYING.Google.
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License, version 2.0,
as published by the Free Software Foundation.
This program is also distributed with certain software (including
but not limited to OpenSSL) that is licensed under separate terms,
as designated in a particular file or component or in included license
documentation. The authors of MySQL hereby grant you an additional
permission to link the program and your derivative works with the
separately licensed software that they have included with MySQL.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License, version 2.0, for more details.
You should have received a copy of the GNU General Public License along with
this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
*****************************************************************************/
/**************************************************//**
@file buf/buf0buf.cc
The database buffer buf_pool
Created 11/5/1995 Heikki Tuuri
*******************************************************/
#include "ha_prototypes.h"
#include "page0size.h"
#include "buf0buf.h"
#ifdef UNIV_NONINL
#include "buf0buf.ic"
#endif
#ifdef UNIV_INNOCHECKSUM
#include "string.h"
#include "mach0data.h"
#endif /* UNIV_INNOCHECKSUM */
#ifndef UNIV_INNOCHECKSUM
#include "mem0mem.h"
#include "btr0btr.h"
#include "fil0fil.h"
#include "fsp0sysspace.h"
#ifndef UNIV_HOTBACKUP
#include "buf0buddy.h"
#include "lock0lock.h"
#include "sync0rw.h"
#include "btr0sea.h"
#include "ibuf0ibuf.h"
#include "trx0undo.h"
#include "trx0purge.h"
#include "log0log.h"
#include "dict0stats_bg.h"
#endif /* !UNIV_HOTBACKUP */
#include "srv0srv.h"
#include "srv0start.h"
#include "dict0dict.h"
#include "log0recv.h"
#include "srv0mon.h"
#include "fsp0sysspace.h"
#endif /* !UNIV_INNOCHECKSUM */
#include "page0zip.h"
#include "buf0checksum.h"
#include "sync0sync.h"
#include "buf0dump.h"
#include "ut0new.h"
#include <new>
#include <map>
#include <sstream>
my_bool srv_numa_interleave = FALSE;
#ifdef HAVE_LIBNUMA
#include <numa.h>
#include <numaif.h>
struct set_numa_interleave_t
{
set_numa_interleave_t()
{
if (srv_numa_interleave) {
ib::info() << "Setting NUMA memory policy to"
" MPOL_INTERLEAVE";
struct bitmask* numa_nodes = numa_get_mems_allowed();
if (set_mempolicy(MPOL_INTERLEAVE,
numa_nodes->maskp,
numa_nodes->size) != 0) {
ib::warn() << "Failed to set NUMA memory"
" policy to MPOL_INTERLEAVE: "
<< strerror(errno);
}
numa_bitmask_free(numa_nodes);
}
}
~set_numa_interleave_t()
{
if (srv_numa_interleave) {
ib::info() << "Setting NUMA memory policy to"
" MPOL_DEFAULT";
if (set_mempolicy(MPOL_DEFAULT, NULL, 0) != 0) {
ib::warn() << "Failed to set NUMA memory"
" policy to MPOL_DEFAULT: "
<< strerror(errno);
}
}
}
};
#define NUMA_MEMPOLICY_INTERLEAVE_IN_SCOPE set_numa_interleave_t scoped_numa
#else
#define NUMA_MEMPOLICY_INTERLEAVE_IN_SCOPE
#endif /* HAVE_LIBNUMA */
/*
IMPLEMENTATION OF THE BUFFER POOL
=================================
Performance improvement:
------------------------
Thread scheduling in NT may be so slow that the OS wait mechanism should
not be used even in waiting for disk reads to complete.
Rather, we should put waiting query threads to the queue of
waiting jobs, and let the OS thread do something useful while the i/o
is processed. In this way we could remove most OS thread switches in
an i/o-intensive benchmark like TPC-C.
A possibility is to put a user space thread library between the database
and NT. User space thread libraries might be very fast.
SQL Server 7.0 can be configured to use 'fibers' which are lightweight
threads in NT. These should be studied.
Buffer frames and blocks
------------------------
Following the terminology of Gray and Reuter, we call the memory
blocks where file pages are loaded buffer frames. For each buffer
frame there is a control block, or shortly, a block, in the buffer
control array. The control info which does not need to be stored
in the file along with the file page, resides in the control block.
Buffer pool struct
------------------
The buffer buf_pool contains a single mutex which protects all the
control data structures of the buf_pool. The content of a buffer frame is
protected by a separate read-write lock in its control block, though.
These locks can be locked and unlocked without owning the buf_pool->mutex.
The OS events in the buf_pool struct can be waited for without owning the
buf_pool->mutex.
The buf_pool->mutex is a hot-spot in main memory, causing a lot of
memory bus traffic on multiprocessor systems when processors
alternately access the mutex. On our Pentium, the mutex is accessed
maybe every 10 microseconds. We gave up the solution to have mutexes
for each control block, for instance, because it seemed to be
complicated.
A solution to reduce mutex contention of the buf_pool->mutex is to
create a separate mutex for the page hash table. On Pentium,
accessing the hash table takes 2 microseconds, about half
of the total buf_pool->mutex hold time.
Control blocks
--------------
The control block contains, for instance, the bufferfix count
which is incremented when a thread wants a file page to be fixed
in a buffer frame. The bufferfix operation does not lock the
contents of the frame, however. For this purpose, the control
block contains a read-write lock.
The buffer frames have to be aligned so that the start memory
address of a frame is divisible by the universal page size, which
is a power of two.
We intend to make the buffer buf_pool size on-line reconfigurable,
that is, the buf_pool size can be changed without closing the database.
Then the database administarator may adjust it to be bigger
at night, for example. The control block array must
contain enough control blocks for the maximum buffer buf_pool size
which is used in the particular database.
If the buf_pool size is cut, we exploit the virtual memory mechanism of
the OS, and just refrain from using frames at high addresses. Then the OS
can swap them to disk.
The control blocks containing file pages are put to a hash table
according to the file address of the page.
We could speed up the access to an individual page by using
"pointer swizzling": we could replace the page references on
non-leaf index pages by direct pointers to the page, if it exists
in the buf_pool. We could make a separate hash table where we could
chain all the page references in non-leaf pages residing in the buf_pool,
using the page reference as the hash key,
and at the time of reading of a page update the pointers accordingly.
Drawbacks of this solution are added complexity and,
possibly, extra space required on non-leaf pages for memory pointers.
A simpler solution is just to speed up the hash table mechanism
in the database, using tables whose size is a power of 2.
Lists of blocks
---------------
There are several lists of control blocks.
The free list (buf_pool->free) contains blocks which are currently not
used.
The common LRU list contains all the blocks holding a file page
except those for which the bufferfix count is non-zero.
The pages are in the LRU list roughly in the order of the last
access to the page, so that the oldest pages are at the end of the
list. We also keep a pointer to near the end of the LRU list,
which we can use when we want to artificially age a page in the
buf_pool. This is used if we know that some page is not needed
again for some time: we insert the block right after the pointer,
causing it to be replaced sooner than would normally be the case.
Currently this aging mechanism is used for read-ahead mechanism
of pages, and it can also be used when there is a scan of a full
table which cannot fit in the memory. Putting the pages near the
end of the LRU list, we make sure that most of the buf_pool stays
in the main memory, undisturbed.
The unzip_LRU list contains a subset of the common LRU list. The
blocks on the unzip_LRU list hold a compressed file page and the
corresponding uncompressed page frame. A block is in unzip_LRU if and
only if the predicate buf_page_belongs_to_unzip_LRU(&block->page)
holds. The blocks in unzip_LRU will be in same order as they are in
the common LRU list. That is, each manipulation of the common LRU
list will result in the same manipulation of the unzip_LRU list.
The chain of modified blocks (buf_pool->flush_list) contains the blocks
holding file pages that have been modified in the memory
but not written to disk yet. The block with the oldest modification
which has not yet been written to disk is at the end of the chain.
The access to this list is protected by buf_pool->flush_list_mutex.
The chain of unmodified compressed blocks (buf_pool->zip_clean)
contains the control blocks (buf_page_t) of those compressed pages
that are not in buf_pool->flush_list and for which no uncompressed
page has been allocated in the buffer pool. The control blocks for
uncompressed pages are accessible via buf_block_t objects that are
reachable via buf_pool->chunks[].
The chains of free memory blocks (buf_pool->zip_free[]) are used by
the buddy allocator (buf0buddy.cc) to keep track of currently unused
memory blocks of size sizeof(buf_page_t)..UNIV_PAGE_SIZE / 2. These
blocks are inside the UNIV_PAGE_SIZE-sized memory blocks of type
BUF_BLOCK_MEMORY that the buddy allocator requests from the buffer
pool. The buddy allocator is solely used for allocating control
blocks for compressed pages (buf_page_t) and compressed page frames.
Loading a file page
-------------------
First, a victim block for replacement has to be found in the
buf_pool. It is taken from the free list or searched for from the
end of the LRU-list. An exclusive lock is reserved for the frame,
the io_fix field is set in the block fixing the block in buf_pool,
and the io-operation for loading the page is queued. The io-handler thread
releases the X-lock on the frame and resets the io_fix field
when the io operation completes.
A thread may request the above operation using the function
buf_page_get(). It may then continue to request a lock on the frame.
The lock is granted when the io-handler releases the x-lock.
Read-ahead
----------
The read-ahead mechanism is intended to be intelligent and
isolated from the semantically higher levels of the database
index management. From the higher level we only need the
information if a file page has a natural successor or
predecessor page. On the leaf level of a B-tree index,
these are the next and previous pages in the natural
order of the pages.
Let us first explain the read-ahead mechanism when the leafs
of a B-tree are scanned in an ascending or descending order.
When a read page is the first time referenced in the buf_pool,
the buffer manager checks if it is at the border of a so-called
linear read-ahead area. The tablespace is divided into these
areas of size 64 blocks, for example. So if the page is at the
border of such an area, the read-ahead mechanism checks if
all the other blocks in the area have been accessed in an
ascending or descending order. If this is the case, the system
looks at the natural successor or predecessor of the page,
checks if that is at the border of another area, and in this case
issues read-requests for all the pages in that area. Maybe
we could relax the condition that all the pages in the area
have to be accessed: if data is deleted from a table, there may
appear holes of unused pages in the area.
A different read-ahead mechanism is used when there appears
to be a random access pattern to a file.
If a new page is referenced in the buf_pool, and several pages
of its random access area (for instance, 32 consecutive pages
in a tablespace) have recently been referenced, we may predict
that the whole area may be needed in the near future, and issue
the read requests for the whole area.
*/
#if (!(defined(UNIV_HOTBACKUP) || defined(UNIV_INNOCHECKSUM)))
/** Value in microseconds */
static const int WAIT_FOR_READ = 100;
static const int WAIT_FOR_WRITE = 100;
/** Number of attempts made to read in a page in the buffer pool */
static const ulint BUF_PAGE_READ_MAX_RETRIES = 100;
/** Number of pages to read ahead */
static const ulint BUF_READ_AHEAD_PAGES = 64;
/** The maximum portion of the buffer pool that can be used for the
read-ahead buffer. (Divide buf_pool size by this amount) */
static const ulint BUF_READ_AHEAD_PORTION = 32;
/** The buffer pools of the database */
buf_pool_t* buf_pool_ptr;
/** true when resizing buffer pool is in the critical path. */
volatile bool buf_pool_resizing;
/** Map of buffer pool chunks by its first frame address
This is newly made by initialization of buffer pool and buf_resize_thread.
Currently, no need mutex protection for update. */
typedef std::map<
const byte*,
buf_chunk_t*,
std::less<const byte*>,
ut_allocator<std::pair<const byte* const, buf_chunk_t*> > >
buf_pool_chunk_map_t;
static buf_pool_chunk_map_t* buf_chunk_map_reg;
/** Chunk map to be used to lookup.
The map pointed by this should not be updated */
static buf_pool_chunk_map_t* buf_chunk_map_ref = NULL;
#ifdef UNIV_DEBUG
/** Disable resizing buffer pool to make assertion code not expensive. */
my_bool buf_disable_resize_buffer_pool_debug = TRUE;
#endif /* UNIV_DEBUG */
#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
/** This is used to insert validation operations in execution
in the debug version */
static ulint buf_dbg_counter = 0;
#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
#if defined UNIV_PFS_MUTEX || defined UNIV_PFS_RWLOCK
# ifndef PFS_SKIP_BUFFER_MUTEX_RWLOCK
/* Buffer block mutexes and rwlocks can be registered
in one group rather than individually. If PFS_GROUP_BUFFER_SYNC
is defined, register buffer block mutex and rwlock
in one group after their initialization. */
# define PFS_GROUP_BUFFER_SYNC
/* This define caps the number of mutexes/rwlocks can
be registered with performance schema. Developers can
modify this define if necessary. Please note, this would
be effective only if PFS_GROUP_BUFFER_SYNC is defined. */
# define PFS_MAX_BUFFER_MUTEX_LOCK_REGISTER ULINT_MAX
# endif /* !PFS_SKIP_BUFFER_MUTEX_RWLOCK */
#endif /* UNIV_PFS_MUTEX || UNIV_PFS_RWLOCK */
/** Macro to determine whether the read of write counter is used depending
on the io_type */
#define MONITOR_RW_COUNTER(io_type, counter) \
((io_type == BUF_IO_READ) \
? (counter##_READ) \
: (counter##_WRITTEN))
/** Registers a chunk to buf_pool_chunk_map
@param[in] chunk chunk of buffers */
static
void
buf_pool_register_chunk(
buf_chunk_t* chunk)
{
buf_chunk_map_reg->insert(buf_pool_chunk_map_t::value_type(
chunk->blocks->frame, chunk));
}
/********************************************************************//**
Gets the smallest oldest_modification lsn for any page in the pool. Returns
zero if all modified pages have been flushed to disk.
@return oldest modification in pool, zero if none */
lsn_t
buf_pool_get_oldest_modification(void)
/*==================================*/
{
lsn_t lsn = 0;
lsn_t oldest_lsn = 0;
/* When we traverse all the flush lists we don't want another
thread to add a dirty page to any flush list. */
log_flush_order_mutex_enter();
for (ulint i = 0; i < srv_buf_pool_instances; i++) {
buf_pool_t* buf_pool;
buf_pool = buf_pool_from_array(i);
buf_flush_list_mutex_enter(buf_pool);
buf_page_t* bpage;
/* We don't let log-checkpoint halt because pages from system
temporary are not yet flushed to the disk. Anyway, object
residing in system temporary doesn't generate REDO logging. */
bpage = buf_pool->oldest_hp.get();
if (bpage != NULL) {
ut_ad(bpage->in_flush_list);
} else {
bpage = UT_LIST_GET_LAST(buf_pool->flush_list);
}
for (; bpage != NULL
&& fsp_is_system_temporary(bpage->id.space());
bpage = UT_LIST_GET_PREV(list, bpage)) {
/* Do nothing. */
}
if (bpage != NULL) {
ut_ad(bpage->in_flush_list);
lsn = bpage->oldest_modification;
buf_pool->oldest_hp.set(bpage);
} else {
/* The last scanned page as entry point,
or nullptr. */
buf_pool->oldest_hp.set(
UT_LIST_GET_FIRST(buf_pool->flush_list));
}
buf_flush_list_mutex_exit(buf_pool);
if (!oldest_lsn || oldest_lsn > lsn) {
oldest_lsn = lsn;
}
}
log_flush_order_mutex_exit();
/* The returned answer may be out of date: the flush_list can
change after the mutex has been released. */
return(oldest_lsn);
}
/********************************************************************//**
Get total buffer pool statistics. */
void
buf_get_total_list_len(
/*===================*/
ulint* LRU_len, /*!< out: length of all LRU lists */
ulint* free_len, /*!< out: length of all free lists */
ulint* flush_list_len) /*!< out: length of all flush lists */
{
ulint i;
*LRU_len = 0;
*free_len = 0;
*flush_list_len = 0;
for (i = 0; i < srv_buf_pool_instances; i++) {
buf_pool_t* buf_pool;
buf_pool = buf_pool_from_array(i);
*LRU_len += UT_LIST_GET_LEN(buf_pool->LRU);
*free_len += UT_LIST_GET_LEN(buf_pool->free);
*flush_list_len += UT_LIST_GET_LEN(buf_pool->flush_list);
}
}
/********************************************************************//**
Get total list size in bytes from all buffer pools. */
void
buf_get_total_list_size_in_bytes(
/*=============================*/
buf_pools_list_size_t* buf_pools_list_size) /*!< out: list sizes
in all buffer pools */
{
ut_ad(buf_pools_list_size);
memset(buf_pools_list_size, 0, sizeof(*buf_pools_list_size));
for (ulint i = 0; i < srv_buf_pool_instances; i++) {
buf_pool_t* buf_pool;
buf_pool = buf_pool_from_array(i);
/* We don't need mutex protection since this is
for statistics purpose */
buf_pools_list_size->LRU_bytes += buf_pool->stat.LRU_bytes;
buf_pools_list_size->unzip_LRU_bytes +=
UT_LIST_GET_LEN(buf_pool->unzip_LRU) * UNIV_PAGE_SIZE;
buf_pools_list_size->flush_list_bytes +=
buf_pool->stat.flush_list_bytes;
}
}
/********************************************************************//**
Get total buffer pool statistics. */
void
buf_get_total_stat(
/*===============*/
buf_pool_stat_t* tot_stat) /*!< out: buffer pool stats */
{
ulint i;
memset(tot_stat, 0, sizeof(*tot_stat));
for (i = 0; i < srv_buf_pool_instances; i++) {
buf_pool_stat_t*buf_stat;
buf_pool_t* buf_pool;
buf_pool = buf_pool_from_array(i);
buf_stat = &buf_pool->stat;
tot_stat->n_page_gets += buf_stat->n_page_gets;
tot_stat->n_pages_read += buf_stat->n_pages_read;
tot_stat->n_pages_written += buf_stat->n_pages_written;
tot_stat->n_pages_created += buf_stat->n_pages_created;
tot_stat->n_ra_pages_read_rnd += buf_stat->n_ra_pages_read_rnd;
tot_stat->n_ra_pages_read += buf_stat->n_ra_pages_read;
tot_stat->n_ra_pages_evicted += buf_stat->n_ra_pages_evicted;
tot_stat->n_pages_made_young += buf_stat->n_pages_made_young;
tot_stat->n_pages_not_made_young +=
buf_stat->n_pages_not_made_young;
}
}
/********************************************************************//**
Allocates a buffer block.
@return own: the allocated block, in state BUF_BLOCK_MEMORY */
buf_block_t*
buf_block_alloc(
/*============*/
buf_pool_t* buf_pool) /*!< in/out: buffer pool instance,
or NULL for round-robin selection
of the buffer pool */
{
buf_block_t* block;
ulint index;
static ulint buf_pool_index;
if (buf_pool == NULL) {
/* We are allocating memory from any buffer pool, ensure
we spread the grace on all buffer pool instances. */
index = buf_pool_index++ % srv_buf_pool_instances;
buf_pool = buf_pool_from_array(index);
}
block = buf_LRU_get_free_block(buf_pool);
buf_block_set_state(block, BUF_BLOCK_MEMORY);
return(block);
}
#endif /* !UNIV_HOTBACKUP && !UNIV_INNOCHECKSUM */
/** Checks if a page contains only zeroes.
@param[in] read_buf database page
@param[in] page_size page size
@return true if page is filled with zeroes */
bool
buf_page_is_zeroes(
const byte* read_buf,
const page_size_t& page_size)
{
for (ulint i = 0; i < page_size.logical(); i++) {
if (read_buf[i] != 0) {
return(false);
}
}
return(true);
}
/** Checks if the page is in crc32 checksum format.
@param[in] read_buf database page
@param[in] checksum_field1 new checksum field
@param[in] checksum_field2 old checksum field
@param[in] page_no page number of given read_buf
@param[in] is_log_enabled true if log option is enabled
@param[in] log_file file pointer to log_file
@param[in] curr_algo current checksum algorithm
@param[in] use_legacy_big_endian use legacy big endian algorithm
@return true if the page is in crc32 checksum format. */
UNIV_INLINE
bool
buf_page_is_checksum_valid_crc32(
const byte* read_buf,
ulint checksum_field1,
ulint checksum_field2,
#ifdef UNIV_INNOCHECKSUM
uintmax_t page_no,
bool is_log_enabled,
FILE* log_file,
const srv_checksum_algorithm_t curr_algo,
#endif /* UNIV_INNOCHECKSUM */
bool use_legacy_big_endian)
{
const uint32_t crc32 = buf_calc_page_crc32(read_buf,
use_legacy_big_endian);
#ifdef UNIV_INNOCHECKSUM
if (is_log_enabled
&& curr_algo == SRV_CHECKSUM_ALGORITHM_STRICT_CRC32) {
fprintf(log_file, "page::%" PRIuMAX ";"
" crc32 calculated = %u;"
" recorded checksum field1 = " ULINTPF
" recorded checksum field2 = " ULINTPF
"\n", page_no, crc32,
checksum_field1, checksum_field2);
}
#endif /* UNIV_INNOCHECKSUM */
if (checksum_field1 != checksum_field2) {
return(false);
}
if (checksum_field1 == crc32) {
return(true);
}
return(false);
}
/** Checks if the page is in innodb checksum format.
@param[in] read_buf database page
@param[in] checksum_field1 new checksum field
@param[in] checksum_field2 old checksum field
@param[in] page_no page number of given read_buf
@param[in] is_log_enabled true if log option is enabled
@param[in] log_file file pointer to log_file
@param[in] curr_algo current checksum algorithm
@return true if the page is in innodb checksum format. */
UNIV_INLINE
bool
buf_page_is_checksum_valid_innodb(
const byte* read_buf,
ulint checksum_field1,
ulint checksum_field2
#ifdef UNIV_INNOCHECKSUM
,uintmax_t page_no,
bool is_log_enabled,
FILE* log_file,
const srv_checksum_algorithm_t curr_algo
#endif /* UNIV_INNOCHECKSUM */
)
{
/* There are 2 valid formulas for
checksum_field2 (old checksum field) which algo=innodb could have
written to the page:
1. Very old versions of InnoDB only stored 8 byte lsn to the
start and the end of the page.
2. Newer InnoDB versions store the old formula checksum
(buf_calc_page_old_checksum()). */
ulint old_checksum = buf_calc_page_old_checksum(read_buf);
ulint new_checksum = buf_calc_page_new_checksum(read_buf);
#ifdef UNIV_INNOCHECKSUM
if (is_log_enabled
&& curr_algo == SRV_CHECKSUM_ALGORITHM_INNODB) {
fprintf(log_file, "page::%" PRIuMAX ";"
" old style: calculated = " ULINTPF
"; recorded = " ULINTPF "\n",
page_no, old_checksum,
checksum_field2);
fprintf(log_file, "page::%" PRIuMAX ";"
" new style: calculated = " ULINTPF
"; crc32 = %u; recorded = " ULINTPF "\n",
page_no, new_checksum,
buf_calc_page_crc32(read_buf), checksum_field1);
}
if (is_log_enabled
&& curr_algo == SRV_CHECKSUM_ALGORITHM_STRICT_INNODB) {
fprintf(log_file, "page::%" PRIuMAX ";"
" old style: calculated = " ULINTPF
"; recorded checksum = " ULINTPF "\n",
page_no, old_checksum,
checksum_field2);
fprintf(log_file, "page::%" PRIuMAX ";"
" new style: calculated = " ULINTPF
"; recorded checksum = " ULINTPF "\n",
page_no, new_checksum,
checksum_field1);
}
#endif /* UNIV_INNOCHECKSUM */
if (checksum_field2 != mach_read_from_4(read_buf + FIL_PAGE_LSN)
&& checksum_field2 != old_checksum) {
return(false);
}
/* old field is fine, check the new field */
/* InnoDB versions < 4.0.14 and < 4.1.1 stored the space id
(always equal to 0), to FIL_PAGE_SPACE_OR_CHKSUM */
if (checksum_field1 != 0 && checksum_field1 != new_checksum) {
return(false);
}
return(true);
}
/** Checks if the page is in none checksum format.
@param[in] read_buf database page
@param[in] checksum_field1 new checksum field
@param[in] checksum_field2 old checksum field
@param[in] page_no page number of given read_buf
@param[in] is_log_enabled true if log option is enabled
@param[in] log_file file pointer to log_file
@param[in] curr_algo current checksum algorithm
@return true if the page is in none checksum format. */
UNIV_INLINE
bool
buf_page_is_checksum_valid_none(
const byte* read_buf,
ulint checksum_field1,
ulint checksum_field2
#ifdef UNIV_INNOCHECKSUM
,uintmax_t page_no,
bool is_log_enabled,
FILE* log_file,
const srv_checksum_algorithm_t curr_algo
#endif /* UNIV_INNOCHECKSUM */
)
{
#ifdef UNIV_INNOCHECKSUM
if (is_log_enabled
&& curr_algo == SRV_CHECKSUM_ALGORITHM_STRICT_NONE) {
fprintf(log_file,
"page::%" PRIuMAX
"; none checksum: calculated = %lu;"
" recorded checksum_field1 = " ULINTPF
" recorded checksum_field2 = " ULINTPF "\n",
page_no, BUF_NO_CHECKSUM_MAGIC,
checksum_field1, checksum_field2);
}
#endif /* UNIV_INNOCHECKSUM */
return(checksum_field1 == checksum_field2
&& checksum_field1 == BUF_NO_CHECKSUM_MAGIC);
}
/** Checks if a page is corrupt.
@param[in] check_lsn true if we need to check and complain about
the LSN
@param[in] read_buf database page
@param[in] page_size page size
@param[in] skip_checksum if true, skip checksum
@param[in] page_no page number of given read_buf
@param[in] strict_check true if strict-check option is enabled
@param[in] is_log_enabled true if log option is enabled
@param[in] log_file file pointer to log_file
@return TRUE if corrupted */
ibool
buf_page_is_corrupted(
bool check_lsn,
const byte* read_buf,
const page_size_t& page_size,
bool skip_checksum
#ifdef UNIV_INNOCHECKSUM
,uintmax_t page_no,
bool strict_check,
bool is_log_enabled,
FILE* log_file
#endif /* UNIV_INNOCHECKSUM */
)
{
ulint checksum_field1;
ulint checksum_field2;
if (!page_size.is_compressed()
&& memcmp(read_buf + FIL_PAGE_LSN + 4,
read_buf + page_size.logical()
- FIL_PAGE_END_LSN_OLD_CHKSUM + 4, 4)) {
/* Stored log sequence numbers at the start and the end
of page do not match */
return(TRUE);
}
#if !defined(UNIV_HOTBACKUP) && !defined(UNIV_INNOCHECKSUM)
if (check_lsn && recv_lsn_checks_on) {
lsn_t current_lsn;
const lsn_t page_lsn
= mach_read_from_8(read_buf + FIL_PAGE_LSN);
/* Since we are going to reset the page LSN during the import
phase it makes no sense to spam the log with error messages. */
if (log_peek_lsn(¤t_lsn) && current_lsn < page_lsn) {
const ulint space_id = mach_read_from_4(
read_buf + FIL_PAGE_SPACE_ID);
const ulint page_no = mach_read_from_4(
read_buf + FIL_PAGE_OFFSET);
ib::error() << "Page " << page_id_t(space_id, page_no)
<< " log sequence number " << page_lsn
<< " is in the future! Current system"
<< " log sequence number "
<< current_lsn << ".";
ib::error() << "Your database may be corrupt or"
" you may have copied the InnoDB"
" tablespace but not the InnoDB"
" log files. "
<< FORCE_RECOVERY_MSG;
}
}
#endif /* !UNIV_HOTBACKUP && !UNIV_INNOCHECKSUM */
/* Check whether the checksum fields have correct values */
if (srv_checksum_algorithm == SRV_CHECKSUM_ALGORITHM_NONE
|| skip_checksum) {
return(FALSE);
}
if (page_size.is_compressed()) {
#ifdef UNIV_INNOCHECKSUM
return(!page_zip_verify_checksum(read_buf,
page_size.physical(),
page_no, strict_check,
is_log_enabled, log_file));
#else
return(!page_zip_verify_checksum(read_buf,
page_size.physical()));
#endif /* UNIV_INNOCHECKSUM */
}
checksum_field1 = mach_read_from_4(
read_buf + FIL_PAGE_SPACE_OR_CHKSUM);
checksum_field2 = mach_read_from_4(
read_buf + page_size.logical() - FIL_PAGE_END_LSN_OLD_CHKSUM);
#if FIL_PAGE_LSN % 8
#error "FIL_PAGE_LSN must be 64 bit aligned"
#endif
/* declare empty pages non-corrupted */
if (checksum_field1 == 0
&& checksum_field2 == 0
&& *reinterpret_cast<const ib_uint64_t*>(
read_buf + FIL_PAGE_LSN) == 0) {
/* make sure that the page is really empty */
ulint i;
for (i = 0; i < page_size.logical(); ++i) {
/* The FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID has been
repurposed for page compression. It can be
set for uncompressed empty pages. */
if ((i < FIL_PAGE_FILE_FLUSH_LSN
|| i >= FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID)
&& read_buf[i] != 0) {
break;
}
}
#ifdef UNIV_INNOCHECKSUM
if (i >= page_size.logical()) {
if (is_log_enabled) {
fprintf(log_file, "Page::%" PRIuMAX
" is empty and uncorrupted\n",
page_no);
}
return(FALSE);
}
#else
return(i < page_size.logical());
#endif /* UNIV_INNOCHECKSUM */
}
#ifndef UNIV_INNOCHECKSUM
const page_id_t page_id(mach_read_from_4(
read_buf + FIL_PAGE_SPACE_ID),
mach_read_from_4(
read_buf + FIL_PAGE_OFFSET));
#endif /* UNIV_INNOCHECKSUM */
DBUG_EXECUTE_IF("buf_page_import_corrupt_failure", return(TRUE); );
const srv_checksum_algorithm_t curr_algo =
static_cast<srv_checksum_algorithm_t>(srv_checksum_algorithm);
bool legacy_checksum_checked = false;
switch (curr_algo) {
case SRV_CHECKSUM_ALGORITHM_CRC32:
case SRV_CHECKSUM_ALGORITHM_STRICT_CRC32:
if (buf_page_is_checksum_valid_crc32(read_buf,
checksum_field1, checksum_field2,
#ifdef UNIV_INNOCHECKSUM
page_no, is_log_enabled, log_file, curr_algo,
#endif /* UNIV_INNOCHECKSUM */
false)) {
return(FALSE);
}
if (buf_page_is_checksum_valid_none(read_buf,
checksum_field1, checksum_field2
#ifdef UNIV_INNOCHECKSUM
, page_no, is_log_enabled, log_file, curr_algo)) {
#else /* UNIV_INNOCHECKSUM */
)) {
if (curr_algo
== SRV_CHECKSUM_ALGORITHM_STRICT_CRC32) {
page_warn_strict_checksum(
curr_algo,
SRV_CHECKSUM_ALGORITHM_NONE,
page_id);
}
#endif /* UNIV_INNOCHECKSUM */
#ifdef UNIV_INNOCHECKSUM
if (is_log_enabled) {
fprintf(log_file, "page::%" PRIuMAX ";"
" old style: calculated = " ULINTPF ";"
" recorded = " ULINTPF "\n", page_no,
buf_calc_page_old_checksum(read_buf),
checksum_field2);
fprintf(log_file, "page::%" PRIuMAX ";"
" new style: calculated = " ULINTPF ";"
" crc32 = %u; recorded = " ULINTPF "\n",
page_no,
buf_calc_page_new_checksum(read_buf),
buf_calc_page_crc32(read_buf),
checksum_field1);
}
#endif /* UNIV_INNOCHECKSUM */
return(FALSE);
}
/* We need to check whether the stored checksum matches legacy
big endian checksum or Innodb checksum. We optimize the order
based on earlier results. if earlier we have found pages
matching legacy big endian checksum, we try to match it first.
Otherwise we check innodb checksum first. */
if (legacy_big_endian_checksum) {
if (buf_page_is_checksum_valid_crc32(read_buf,
checksum_field1, checksum_field2,
#ifdef UNIV_INNOCHECKSUM
page_no, is_log_enabled, log_file, curr_algo,
#endif /* UNIV_INNOCHECKSUM */
true)) {
return(FALSE);
}
legacy_checksum_checked = true;
}
if (buf_page_is_checksum_valid_innodb(read_buf,
checksum_field1, checksum_field2
#ifdef UNIV_INNOCHECKSUM
, page_no, is_log_enabled, log_file, curr_algo)) {
#else /* UNIV_INNOCHECKSUM */
)) {
if (curr_algo
== SRV_CHECKSUM_ALGORITHM_STRICT_CRC32) {
page_warn_strict_checksum(
curr_algo,
SRV_CHECKSUM_ALGORITHM_INNODB,
page_id);
}
#endif /* UNIV_INNOCHECKSUM */
return(FALSE);
}