Line data Source code
1 : /*
2 : Unix SMB/CIFS implementation.
3 :
4 : trivial database library
5 :
6 : Copyright (C) Rusty Russell 2009
7 :
8 : ** NOTE! The following LGPL license applies to the tdb
9 : ** library. This does NOT imply that all of Samba is released
10 : ** under the LGPL
11 :
12 : This library is free software; you can redistribute it and/or
13 : modify it under the terms of the GNU Lesser General Public
14 : License as published by the Free Software Foundation; either
15 : version 3 of the License, or (at your option) any later version.
16 :
17 : This library is distributed in the hope that it will be useful,
18 : but WITHOUT ANY WARRANTY; without even the implied warranty of
19 : MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20 : Lesser General Public License for more details.
21 :
22 : You should have received a copy of the GNU Lesser General Public
23 : License along with this library; if not, see <http://www.gnu.org/licenses/>.
24 : */
25 : #include "tdb_private.h"
26 :
27 : /* Since we opened it, these shouldn't fail unless it's recent corruption. */
28 65662 : static bool tdb_check_header(struct tdb_context *tdb, tdb_off_t *recovery)
29 : {
30 1 : struct tdb_header hdr;
31 1 : uint32_t h1, h2;
32 :
33 65662 : if (tdb->methods->tdb_read(tdb, 0, &hdr, sizeof(hdr), 0) == -1)
34 0 : return false;
35 65662 : if (strcmp(hdr.magic_food, TDB_MAGIC_FOOD) != 0)
36 160 : goto corrupt;
37 :
38 65502 : CONVERT(hdr);
39 65502 : if (hdr.version != TDB_VERSION)
40 64 : goto corrupt;
41 :
42 65438 : if (hdr.rwlocks != 0 &&
43 152 : hdr.rwlocks != TDB_FEATURE_FLAG_MAGIC &&
44 152 : hdr.rwlocks != TDB_HASH_RWLOCK_MAGIC)
45 64 : goto corrupt;
46 :
47 65374 : tdb_header_hash(tdb, &h1, &h2);
48 65374 : if (hdr.magic1_hash && hdr.magic2_hash &&
49 65367 : (hdr.magic1_hash != h1 || hdr.magic2_hash != h2))
50 128 : goto corrupt;
51 :
52 65246 : if (hdr.hash_size == 0)
53 2 : goto corrupt;
54 :
55 65244 : if (hdr.hash_size != tdb->hash_size)
56 62 : goto corrupt;
57 :
58 65182 : if (hdr.recovery_start != 0 &&
59 51 : hdr.recovery_start < TDB_DATA_START(tdb->hash_size))
60 16 : goto corrupt;
61 :
62 65166 : *recovery = hdr.recovery_start;
63 65166 : return true;
64 :
65 496 : corrupt:
66 496 : tdb->ecode = TDB_ERR_CORRUPT;
67 496 : TDB_LOG((tdb, TDB_DEBUG_ERROR, "Header is corrupt\n"));
68 496 : return false;
69 : }
70 :
71 : /* Generic record header check. */
72 385563 : static bool tdb_check_record(struct tdb_context *tdb,
73 : tdb_off_t off,
74 : const struct tdb_record *rec)
75 : {
76 2 : tdb_off_t tailer;
77 :
78 : /* Check rec->next: 0 or points to record offset, aligned. */
79 385563 : if (rec->next > 0 && rec->next < TDB_DATA_START(tdb->hash_size)){
80 48 : TDB_LOG((tdb, TDB_DEBUG_ERROR,
81 : "Record offset %u too small next %u\n",
82 : off, rec->next));
83 48 : goto corrupt;
84 : }
85 2 : if (rec->next + sizeof(*rec) < rec->next) {
86 : TDB_LOG((tdb, TDB_DEBUG_ERROR,
87 : "Record offset %u too large next %u\n",
88 : off, rec->next));
89 : goto corrupt;
90 : }
91 385515 : if ((rec->next % TDB_ALIGNMENT) != 0) {
92 12 : TDB_LOG((tdb, TDB_DEBUG_ERROR,
93 : "Record offset %u misaligned next %u\n",
94 : off, rec->next));
95 12 : goto corrupt;
96 : }
97 385503 : if (tdb_oob(tdb, rec->next, sizeof(*rec), 0))
98 244 : goto corrupt;
99 :
100 : /* Check rec_len: similar to rec->next, implies next record. */
101 385259 : if ((rec->rec_len % TDB_ALIGNMENT) != 0) {
102 24 : TDB_LOG((tdb, TDB_DEBUG_ERROR,
103 : "Record offset %u misaligned length %u\n",
104 : off, rec->rec_len));
105 24 : goto corrupt;
106 : }
107 : /* Must fit tailer. */
108 385235 : if (rec->rec_len < sizeof(tailer)) {
109 6 : TDB_LOG((tdb, TDB_DEBUG_ERROR,
110 : "Record offset %u too short length %u\n",
111 : off, rec->rec_len));
112 6 : goto corrupt;
113 : }
114 : /* OOB allows "right at the end" access, so this works for last rec. */
115 385229 : if (tdb_oob(tdb, off, sizeof(*rec)+rec->rec_len, 0))
116 298 : goto corrupt;
117 :
118 : /* Check tailer. */
119 384931 : if (tdb_ofs_read(tdb, off+sizeof(*rec)+rec->rec_len-sizeof(tailer),
120 : &tailer) == -1)
121 0 : goto corrupt;
122 384931 : if (tailer != sizeof(*rec) + rec->rec_len) {
123 440 : TDB_LOG((tdb, TDB_DEBUG_ERROR,
124 : "Record offset %u invalid tailer\n", off));
125 440 : goto corrupt;
126 : }
127 :
128 384489 : return true;
129 :
130 1072 : corrupt:
131 1072 : tdb->ecode = TDB_ERR_CORRUPT;
132 1072 : return false;
133 : }
134 :
135 : /* Grab some bytes: may copy if can't use mmap.
136 : Caller has already done bounds check. */
137 634186 : static TDB_DATA get_bytes(struct tdb_context *tdb,
138 : tdb_off_t off, tdb_len_t len)
139 : {
140 1 : TDB_DATA d;
141 :
142 634186 : d.dsize = len;
143 :
144 634186 : if (tdb->transaction == NULL && tdb->map_ptr != NULL)
145 318233 : d.dptr = (unsigned char *)tdb->map_ptr + off;
146 : else
147 315953 : d.dptr = tdb_alloc_read(tdb, off, d.dsize);
148 634186 : return d;
149 : }
150 :
151 : /* Frees data if we're not able to simply use mmap. */
152 634186 : static void put_bytes(struct tdb_context *tdb, TDB_DATA d)
153 : {
154 634186 : if (tdb->transaction == NULL && tdb->map_ptr != NULL)
155 318232 : return;
156 315953 : free(d.dptr);
157 : }
158 :
159 : /* We use the excellent Jenkins lookup3 hash; this is based on hash_word2.
160 : * See: http://burtleburtle.net/bob/c/lookup3.c
161 : */
162 : #define rot(x,k) (((x)<<(k)) | ((x)>>(32-(k))))
163 3089264 : static void hash(uint32_t key, uint32_t *pc, uint32_t *pb)
164 : {
165 16 : uint32_t a,b,c;
166 :
167 : /* Set up the internal state */
168 3089264 : a = b = c = 0xdeadbeef + *pc;
169 3089264 : c += *pb;
170 3089264 : a += key;
171 3089264 : c ^= b; c -= rot(b,14);
172 3089264 : a ^= c; a -= rot(c,11);
173 3089264 : b ^= a; b -= rot(a,25);
174 3089264 : c ^= b; c -= rot(b,16);
175 3089264 : a ^= c; a -= rot(c,4);
176 3089264 : b ^= a; b -= rot(a,14);
177 3089264 : c ^= b; c -= rot(b,24);
178 3089264 : *pc=c; *pb=b;
179 3089264 : }
180 :
181 : /*
182 : We want to check that all free records are in the free list
183 : (only once), and all free list entries are free records. Similarly
184 : for each hash chain of used records.
185 :
186 : Doing that naively (without walking hash chains, since we want to be
187 : linear) means keeping a list of records which have been seen in each
188 : hash chain, and another of records pointed to (ie. next pointers
189 : from records and the initial hash chain heads). These two lists
190 : should be equal. This will take 8 bytes per record, and require
191 : sorting at the end.
192 :
193 : So instead, we record each offset in a bitmap such a way that
194 : recording it twice will cancel out. Since each offset should appear
195 : exactly twice, the bitmap should be zero at the end.
196 :
197 : The approach was inspired by Bloom Filters (see Wikipedia). For
198 : each value, we flip K bits in a bitmap of size N. The number of
199 : distinct arrangements is:
200 :
201 : N! / (K! * (N-K)!)
202 :
203 : Of course, not all arrangements are actually distinct, but testing
204 : shows this formula to be close enough.
205 :
206 : So, if K == 8 and N == 256, the probability of two things flipping the same
207 : bits is 1 in 409,663,695,276,000.
208 :
209 : Given that ldb uses a hash size of 10000, using 32 bytes per hash chain
210 : (320k) seems reasonable.
211 : */
212 : #define NUM_HASHES 8
213 : #define BITMAP_BITS 256
214 :
215 6178512 : static void bit_flip(unsigned char bits[], unsigned int idx)
216 : {
217 6178512 : bits[idx / CHAR_BIT] ^= (1 << (idx % CHAR_BIT));
218 6178496 : }
219 :
220 : /* We record offsets in a bitmap for the particular chain it should be in. */
221 772316 : static void record_offset(unsigned char bits[], tdb_off_t off)
222 : {
223 772316 : uint32_t h1 = off, h2 = 0;
224 4 : unsigned int i;
225 :
226 : /* We get two good hash values out of jhash2, so we use both. Then
227 : * we keep going to produce further hash values. */
228 3861580 : for (i = 0; i < NUM_HASHES / 2; i++) {
229 3089264 : hash(off, &h1, &h2);
230 3089264 : bit_flip(bits, h1 % BITMAP_BITS);
231 3089264 : bit_flip(bits, h2 % BITMAP_BITS);
232 3089264 : h2++;
233 : }
234 772316 : }
235 :
236 : /* Check that an in-use record is valid. */
237 320022 : static bool tdb_check_used_record(struct tdb_context *tdb,
238 : tdb_off_t off,
239 : const struct tdb_record *rec,
240 : unsigned char **hashes,
241 : int (*check)(TDB_DATA, TDB_DATA, void *),
242 : void *private_data)
243 : {
244 1 : TDB_DATA key, data;
245 1 : tdb_len_t len;
246 :
247 320022 : if (!tdb_check_record(tdb, off, rec))
248 888 : return false;
249 :
250 : /* key + data + tailer must fit in record */
251 319134 : len = rec->key_len;
252 319134 : len += rec->data_len;
253 319134 : if (len < rec->data_len) {
254 : /* overflow */
255 0 : TDB_LOG((tdb, TDB_DEBUG_ERROR, "Record lengths overflow\n"));
256 0 : return false;
257 : }
258 319134 : len += sizeof(tdb_off_t);
259 319134 : if (len < sizeof(tdb_off_t)) {
260 : /* overflow */
261 0 : TDB_LOG((tdb, TDB_DEBUG_ERROR, "Record lengths overflow\n"));
262 0 : return false;
263 : }
264 :
265 319134 : if (len > rec->rec_len) {
266 586 : TDB_LOG((tdb, TDB_DEBUG_ERROR,
267 : "Record offset %u too short for contents\n", off));
268 586 : return false;
269 : }
270 :
271 318548 : key = get_bytes(tdb, off + sizeof(*rec), rec->key_len);
272 318548 : if (!key.dptr)
273 0 : return false;
274 :
275 318548 : if (tdb->hash_fn(&key) != rec->full_hash) {
276 586 : TDB_LOG((tdb, TDB_DEBUG_ERROR,
277 : "Record offset %u has incorrect hash\n", off));
278 586 : goto fail_put_key;
279 : }
280 :
281 : /* Mark this offset as a known value for this hash bucket. */
282 317962 : record_offset(hashes[BUCKET(rec->full_hash)+1], off);
283 : /* And similarly if the next pointer is valid. */
284 317962 : if (rec->next)
285 192808 : record_offset(hashes[BUCKET(rec->full_hash)+1], rec->next);
286 :
287 : /* If they supply a check function and this record isn't dead,
288 : get data and feed it. */
289 317962 : if (check && rec->magic != TDB_DEAD_MAGIC) {
290 315638 : data = get_bytes(tdb, off + sizeof(*rec) + rec->key_len,
291 315638 : rec->data_len);
292 315638 : if (!data.dptr)
293 0 : goto fail_put_key;
294 :
295 315638 : if (check(key, data, private_data) == -1)
296 428 : goto fail_put_data;
297 315210 : put_bytes(tdb, data);
298 : }
299 :
300 317534 : put_bytes(tdb, key);
301 317534 : return true;
302 :
303 428 : fail_put_data:
304 428 : put_bytes(tdb, data);
305 1014 : fail_put_key:
306 1014 : put_bytes(tdb, key);
307 1014 : return false;
308 : }
309 :
310 : /* Check that an unused record is valid. */
311 65541 : static bool tdb_check_free_record(struct tdb_context *tdb,
312 : tdb_off_t off,
313 : const struct tdb_record *rec,
314 : unsigned char **hashes)
315 : {
316 65541 : if (!tdb_check_record(tdb, off, rec))
317 184 : return false;
318 :
319 : /* Mark this offset as a known value for the free list. */
320 65357 : record_offset(hashes[0], off);
321 : /* And similarly if the next pointer is valid. */
322 65357 : if (rec->next)
323 510 : record_offset(hashes[0], rec->next);
324 65356 : return true;
325 : }
326 :
327 : /* Slow, but should be very rare. */
328 33 : size_t tdb_dead_space(struct tdb_context *tdb, tdb_off_t off)
329 : {
330 0 : size_t len;
331 :
332 40685457 : for (len = 0; off + len < tdb->map_size; len++) {
333 0 : char c;
334 40685424 : if (tdb->methods->tdb_read(tdb, off, &c, 1, 0))
335 0 : return 0;
336 40685424 : if (c != 0 && c != 0x42)
337 0 : break;
338 : }
339 33 : return len;
340 : }
341 :
342 65702 : _PUBLIC_ int tdb_check(struct tdb_context *tdb,
343 : int (*check)(TDB_DATA key, TDB_DATA data, void *private_data),
344 : void *private_data)
345 : {
346 1 : unsigned int h;
347 1 : unsigned char **hashes;
348 1 : tdb_off_t off, recovery_start;
349 1 : struct tdb_record rec;
350 65702 : bool found_recovery = false;
351 1 : tdb_len_t dead;
352 1 : bool locked;
353 :
354 : /* Read-only databases use no locking at all: it's best-effort.
355 : * We may have a write lock already, so skip that case too. */
356 65702 : if (tdb->read_only || tdb->allrecord_lock.count != 0) {
357 26 : locked = false;
358 : } else {
359 65676 : if (tdb_lockall_read(tdb) == -1)
360 40 : return -1;
361 65635 : locked = true;
362 : }
363 :
364 : /* Make sure we know true size of the underlying file. */
365 65662 : tdb_oob(tdb, tdb->map_size, 1, 1);
366 :
367 : /* Header must be OK: also gets us the recovery ptr, if any. */
368 65662 : if (!tdb_check_header(tdb, &recovery_start))
369 496 : goto unlock;
370 :
371 : /* We should have the whole header, too. */
372 65166 : if (tdb->map_size < TDB_DATA_START(tdb->hash_size)) {
373 0 : tdb->ecode = TDB_ERR_CORRUPT;
374 0 : TDB_LOG((tdb, TDB_DEBUG_ERROR, "File too short for hashes\n"));
375 0 : goto unlock;
376 : }
377 :
378 : /* One big malloc: pointers then bit arrays. */
379 65166 : hashes = (unsigned char **)calloc(
380 65166 : 1, sizeof(hashes[0]) * (1+tdb->hash_size)
381 65166 : + BITMAP_BITS / CHAR_BIT * (1+tdb->hash_size));
382 65166 : if (!hashes) {
383 0 : tdb->ecode = TDB_ERR_OOM;
384 0 : goto unlock;
385 : }
386 :
387 : /* Initialize pointers */
388 65166 : hashes[0] = (unsigned char *)(&hashes[1+tdb->hash_size]);
389 322527 : for (h = 1; h < 1+tdb->hash_size; h++)
390 257361 : hashes[h] = hashes[h-1] + BITMAP_BITS / CHAR_BIT;
391 :
392 : /* Freelist and hash headers are all in a row: read them. */
393 387693 : for (h = 0; h < 1+tdb->hash_size; h++) {
394 322527 : if (tdb_ofs_read(tdb, FREELIST_TOP + h*sizeof(tdb_off_t),
395 : &off) == -1)
396 0 : goto free;
397 322527 : if (off)
398 195679 : record_offset(hashes[h], off);
399 : }
400 :
401 : /* For each record, read it in and check it's ok. */
402 65166 : for (off = TDB_DATA_START(tdb->hash_size);
403 448117 : off < tdb->map_size;
404 382951 : off += sizeof(rec) + rec.rec_len) {
405 386007 : if (tdb->methods->tdb_read(tdb, off, &rec, sizeof(rec),
406 386007 : DOCONV()) == -1)
407 0 : goto free;
408 386007 : switch (rec.magic) {
409 320022 : case TDB_MAGIC:
410 : case TDB_DEAD_MAGIC:
411 320022 : if (!tdb_check_used_record(tdb, off, &rec, hashes,
412 : check, private_data))
413 2488 : goto free;
414 317533 : break;
415 65541 : case TDB_FREE_MAGIC:
416 65541 : if (!tdb_check_free_record(tdb, off, &rec, hashes))
417 184 : goto free;
418 65356 : break;
419 : /* If we crash after ftruncate, we can get zeroes or fill. */
420 60 : case TDB_RECOVERY_INVALID_MAGIC:
421 : case 0x42424242:
422 60 : if (recovery_start == off) {
423 27 : found_recovery = true;
424 27 : break;
425 : }
426 33 : dead = tdb_dead_space(tdb, off);
427 33 : if (dead < sizeof(rec))
428 0 : goto corrupt;
429 :
430 33 : TDB_LOG((tdb, TDB_DEBUG_ERROR,
431 : "Dead space at %u-%u (of %u)\n",
432 : off, off + dead, tdb->map_size));
433 33 : rec.rec_len = dead - sizeof(rec);
434 33 : break;
435 0 : case TDB_RECOVERY_MAGIC:
436 0 : if (recovery_start != off) {
437 0 : TDB_LOG((tdb, TDB_DEBUG_ERROR,
438 : "Unexpected recovery record at offset %u\n",
439 : off));
440 0 : goto free;
441 : }
442 0 : found_recovery = true;
443 0 : break;
444 0 : default: ;
445 384 : corrupt:
446 384 : tdb->ecode = TDB_ERR_CORRUPT;
447 384 : TDB_LOG((tdb, TDB_DEBUG_ERROR,
448 : "Bad magic 0x%x at offset %u\n",
449 : rec.magic, off));
450 384 : goto free;
451 : }
452 : }
453 :
454 : /* Now, hashes should all be empty: each record exists and is referred
455 : * to by one other. */
456 374866 : for (h = 0; h < 1+tdb->hash_size; h++) {
457 : unsigned int i;
458 10321765 : for (i = 0; i < BITMAP_BITS / CHAR_BIT; i++) {
459 10009009 : if (hashes[h][i] != 0) {
460 273 : tdb->ecode = TDB_ERR_CORRUPT;
461 273 : TDB_LOG((tdb, TDB_DEBUG_ERROR,
462 : "Hashes do not match records\n"));
463 273 : goto free;
464 : }
465 : }
466 : }
467 :
468 : /* We must have found recovery area if there was one. */
469 61837 : if (recovery_start != 0 && !found_recovery) {
470 8 : TDB_LOG((tdb, TDB_DEBUG_ERROR,
471 : "Expected a recovery area at %u\n",
472 : recovery_start));
473 8 : goto free;
474 : }
475 :
476 61829 : free(hashes);
477 61829 : if (locked) {
478 61803 : tdb_unlockall_read(tdb);
479 : }
480 61828 : return 0;
481 :
482 3337 : free:
483 3337 : free(hashes);
484 3833 : unlock:
485 3833 : if (locked) {
486 3833 : tdb_unlockall_read(tdb);
487 : }
488 3833 : return -1;
489 : }
|