diff options
Diffstat (limited to 'debian/htdig/htdig-3.2.0b6/db/mp_sync.c')
-rw-r--r-- | debian/htdig/htdig-3.2.0b6/db/mp_sync.c | 630 |
1 file changed, 630 insertions, 0 deletions
/*-
 * See the file LICENSE for redistribution information.
 *
 * Copyright (c) 1996, 1997, 1998, 1999
 *	Sleepycat Software.  All rights reserved.
 */
/*
 * NOTE(review): this text was reconstructed from a cgit diff view in which
 * the source lines were fused together -- verify byte-for-byte against the
 * upstream Berkeley DB 3.x mp_sync.c before relying on exact formatting.
 */
#include "db_config.h"

#ifndef lint
static const char sccsid[] = "@(#)mp_sync.c	11.10 (Sleepycat) 10/29/99";
#endif /* not lint */

#ifndef NO_SYSTEM_INCLUDES
#include <sys/types.h>

#include <errno.h>
#include <stdlib.h>
#endif

#include "db_int.h"
#include "db_shash.h"
#include "mp.h"

static int CDB___bhcmp __P((const void *, const void *));
static int CDB___memp_fsync __P((DB_MPOOLFILE *));
static int CDB___memp_sballoc __P((DB_ENV *, BH ***, u_int32_t *));

/*
 * CDB_memp_sync --
 *	Mpool sync function: checkpoint the buffer cache by writing the
 *	dirty buffers needed to make the log up to *lsnp recoverable.
 *
 * Returns 0 on success, DB_INCOMPLETE if pinned buffers remain to be
 * written by their holders, or a system error code.
 */
int
CDB_memp_sync(dbenv, lsnp)
	DB_ENV *dbenv;
	DB_LSN *lsnp;
{
	BH *bhp, **bharray;
	DB_MPOOL *dbmp;
	DB_LSN tlsn;
	MCACHE *mc;
	MPOOL *mp;
	MPOOLFILE *mfp;
	u_int32_t ar_cnt, i, ndirty;
	int ret, retry_done, retry_need, wrote;

	PANIC_CHECK(dbenv);
	ENV_REQUIRES_CONFIG(dbenv, dbenv->mp_handle, DB_INIT_MPOOL);

	dbmp = dbenv->mp_handle;
	mp = dbmp->reginfo.primary;

	/* Checkpoints only make sense when logging is configured. */
	if (!F_ISSET(dbenv, DB_ENV_LOGGING)) {
		CDB___db_err(dbenv, "CDB_memp_sync: requires logging");
		return (EINVAL);
	}

	/*
	 * If no LSN is provided, flush the entire cache.
	 *
	 * !!!
	 * Our current behavior is to flush the entire cache, so there's
	 * nothing special we have to do here other than deal with NULL
	 * pointers.
	 */
	if (lsnp == NULL) {
		ZERO_LSN(tlsn);
		lsnp = &tlsn;
		F_SET(mp, MP_LSN_RETRY);
	}

	/*
	 * Sync calls are single-threaded so that we don't have multiple
	 * threads, with different checkpoint LSNs, walking the caches
	 * and updating the checkpoint LSNs and how many buffers remain
	 * to be written for the checkpoint.  This shouldn't be a problem,
	 * any application that has multiple checkpoint threads isn't what
	 * I'd call trustworthy.
	 */
	MUTEX_LOCK(&mp->sync_mutex, dbenv->lockfhp);

	/*
	 * If the application is asking about a previous call to CDB_memp_sync(),
	 * and we haven't found any buffers that the application holding the
	 * pin couldn't write, return yes or no based on the current count.
	 * Note, if the application is asking about a LSN *smaller* than one
	 * we've already handled or are currently handling, then we return a
	 * result based on the count for the larger LSN.
	 */
	R_LOCK(dbenv, &dbmp->reginfo);
	if (!IS_ZERO_LSN(*lsnp) &&
	    !F_ISSET(mp, MP_LSN_RETRY) && CDB_log_compare(lsnp, &mp->lsn) <= 0) {
		if (mp->lsn_cnt == 0) {
			*lsnp = mp->lsn;
			ret = 0;
		} else
			ret = DB_INCOMPLETE;

		R_UNLOCK(dbenv, &dbmp->reginfo);
		MUTEX_UNLOCK(&mp->sync_mutex);
		return (ret);
	}

	/*
	 * Allocate room for a list of buffers, and decide how many buffers
	 * we can pin down.
	 *
	 * !!!
	 * Note: CDB___memp_sballoc has released the region lock if we're not
	 * continuing forward.
	 */
	if ((ret =
	    CDB___memp_sballoc(dbenv, &bharray, &ndirty)) != 0 || ndirty == 0) {
		MUTEX_UNLOCK(&mp->sync_mutex);
		return (ret);
	}

	retry_done = 0;
retry:	retry_need = 0;
	/*
	 * Start a new checkpoint.
	 *
	 * Save the LSN.  We know that it's a new LSN, a retry, or larger than
	 * the one for which we were already doing a checkpoint.  (BTW, I don't
	 * expect to see multiple LSN's from the same or multiple processes,
	 * but You Just Never Know.  Responding as if they all called with the
	 * largest of the LSNs specified makes everything work.)
	 *
	 * We don't currently use the LSN we save.  We could potentially save
	 * the last-written LSN in each buffer header and use it to determine
	 * what buffers need to be written.  The problem with this is that it's
	 * sizeof(LSN) more bytes of buffer header.  We currently write all the
	 * dirty buffers instead, but with a sufficiently large cache that's
	 * going to be a problem.
	 */
	mp->lsn = *lsnp;

	/*
	 * Clear the global count of buffers waiting to be written, walk the
	 * list of files clearing the count of buffers waiting to be written.
	 *
	 * Clear the retry flag.
	 */
	mp->lsn_cnt = 0;
	for (mfp = SH_TAILQ_FIRST(&mp->mpfq, __mpoolfile);
	    mfp != NULL; mfp = SH_TAILQ_NEXT(mfp, q, __mpoolfile))
		mfp->lsn_cnt = 0;
	F_CLR(mp, MP_LSN_RETRY);

	/*
	 * Walk each cache's list of buffers and mark all dirty buffers to be
	 * written and all pinned buffers to be potentially written (we can't
	 * know if they'll need to be written until the holder returns them to
	 * the cache).  We do this in one pass while holding the region locked
	 * so that processes can't make new buffers dirty, causing us to never
	 * finish.  Since the application may have restarted the sync using a
	 * different LSN value, clear any BH_WRITE flags that appear leftover
	 * from previous calls.
	 *
	 * Keep a count of the total number of buffers we need to write in
	 * MPOOL->lsn_cnt, and for each file, in MPOOLFILE->lsn_count.
	 */
	for (ar_cnt = 0, i = 0; i < mp->nc_reg; ++i) {
		mc = dbmp->c_reginfo[i].primary;

		for (bhp = SH_TAILQ_FIRST(&mc->bhq, __bh);
		    bhp != NULL; bhp = SH_TAILQ_NEXT(bhp, q, __bh)) {
			if (F_ISSET(bhp, BH_DIRTY) || bhp->ref != 0) {
				F_SET(bhp, BH_WRITE);

				++mp->lsn_cnt;

				mfp = R_ADDR(&dbmp->reginfo, bhp->mf_offset);
				++mfp->lsn_cnt;

				/*
				 * If the buffer isn't being used, we can write
				 * it immediately, so increment its reference
				 * count to lock it down, and save a reference
				 * to it.
				 *
				 * If we've run out of space to store buffer
				 * refs, we're screwed.  We don't want to
				 * realloc the array while holding a region
				 * lock, so we set a flag and deal with it
				 * later.
				 */
				if (bhp->ref == 0) {
					++bhp->ref;
					bharray[ar_cnt] = bhp;

					if (++ar_cnt >= ndirty) {
						retry_need = 1;
						break;
					}
				}
			} else
				if (F_ISSET(bhp, BH_WRITE))
					F_CLR(bhp, BH_WRITE);
		}
		if (ar_cnt >= ndirty)
			break;
	}

	/* If there are no buffers we can write immediately, we're done. */
	if (ar_cnt == 0) {
		ret = mp->lsn_cnt ? DB_INCOMPLETE : 0;
		goto done;
	}

	R_UNLOCK(dbenv, &dbmp->reginfo);

	/*
	 * Sort the buffers we're going to write immediately.
	 *
	 * We try and write the buffers in file/page order: it should reduce
	 * seeks by the underlying filesystem and possibly reduce the actual
	 * number of writes.
	 */
	if (ar_cnt > 1)
		qsort(bharray, ar_cnt, sizeof(BH *), CDB___bhcmp);

	R_LOCK(dbenv, &dbmp->reginfo);

	/* Walk the array, writing buffers. */
	for (i = 0; i < ar_cnt; ++i) {
		/*
		 * It's possible for a thread to have gotten the buffer since
		 * we listed it for writing.  If the reference count is still
		 * 1, we're the only ones using the buffer, go ahead and write.
		 * If it's >1, then skip the buffer and assume that it will be
		 * written when it's returned to the cache.
		 */
		if (bharray[i]->ref > 1) {
			--bharray[i]->ref;
			continue;
		}

		/* Write the buffer. */
		mfp = R_ADDR(&dbmp->reginfo, bharray[i]->mf_offset);
		ret = CDB___memp_bhwrite(dbmp, mfp, bharray[i], NULL, &wrote);

		/* Release the buffer. */
		--bharray[i]->ref;

		if (ret == 0 && wrote)
			continue;

		/*
		 * Any process syncing the shared memory buffer pool had best
		 * be able to write to any underlying file.  Be understanding,
		 * but firm, on this point.
		 */
		if (ret == 0) {
			CDB___db_err(dbenv, "%s: unable to flush page: %lu",
			    CDB___memp_fns(dbmp, mfp), (u_long)bharray[i]->pgno);
			ret = EPERM;
		}

		/*
		 * On error, clear MPOOL->lsn and set MP_LSN_RETRY so that no
		 * future checkpoint return can depend on this failure.  Don't
		 * bother to reset/clear:
		 *
		 *	MPOOL->lsn_cnt
		 *	MPOOLFILE->lsn_cnt
		 *	buffer BH_WRITE flags
		 *
		 * they don't make any difference.
		 */
		ZERO_LSN(mp->lsn);
		F_SET(mp, MP_LSN_RETRY);

		/* Release any buffers we're still pinning down. */
		while (++i < ar_cnt)
			--bharray[i]->ref;

		goto done;
	}

	ret = mp->lsn_cnt != 0 ? DB_INCOMPLETE : 0;

	/*
	 * If there were too many buffers and we're not returning an error, we
	 * re-try the checkpoint once -- since we allocated 80% of the total
	 * buffer count, once should be enough.  If it still doesn't work, some
	 * other thread of control is dirtying buffers as fast as we're writing
	 * them, and we might as well give up for now.  In the latter case, set
	 * the global retry flag, we'll have to start from scratch on the next
	 * checkpoint.
	 */
	if (retry_need) {
		if (retry_done) {
			ret = DB_INCOMPLETE;
			F_SET(mp, MP_LSN_RETRY);
		} else {
			retry_done = 1;
			goto retry;
		}
	}

done:	R_UNLOCK(dbenv, &dbmp->reginfo);
	MUTEX_UNLOCK(&mp->sync_mutex);

	CDB___os_free(bharray, ndirty * sizeof(BH *));

	return (ret);
}

/*
 * CDB_memp_fsync --
 *	Mpool file sync function: flush a single file's dirty buffers.
 */
int
CDB_memp_fsync(dbmfp)
	DB_MPOOLFILE *dbmfp;
{
	DB_ENV *dbenv;
	DB_MPOOL *dbmp;
	int is_tmp;

	dbmp = dbmfp->dbmp;
	dbenv = dbmp->dbenv;

	PANIC_CHECK(dbenv);

	/*
	 * If this handle doesn't have a file descriptor that's open for
	 * writing, or if the file is a temporary, there's no reason to
	 * proceed further.
	 */
	if (F_ISSET(dbmfp, MP_READONLY))
		return (0);

	/* The MP_TEMP flag lives in the shared region; read it locked. */
	R_LOCK(dbenv, &dbmp->reginfo);
	is_tmp = F_ISSET(dbmfp->mfp, MP_TEMP);
	R_UNLOCK(dbenv, &dbmp->reginfo);
	if (is_tmp)
		return (0);

	return (CDB___memp_fsync(dbmfp));
}

/*
 * CDB___mp_xxx_fh --
 *	Return a file descriptor for DB 1.85 compatibility locking.
 *
 * PUBLIC: int CDB___mp_xxx_fh __P((DB_MPOOLFILE *, DB_FH **));
 */
int
CDB___mp_xxx_fh(dbmfp, fhp)
	DB_MPOOLFILE *dbmfp;
	DB_FH **fhp;
{
	/*
	 * This is a truly spectacular layering violation, intended ONLY to
	 * support compatibility for the DB 1.85 DB->fd call.
	 *
	 * Sync the database file to disk, creating the file as necessary.
	 *
	 * We skip the MP_READONLY and MP_TEMP tests done by CDB_memp_fsync(3).
	 * The MP_READONLY test isn't interesting because we will either
	 * already have a file descriptor (we opened the database file for
	 * reading) or we aren't readonly (we created the database which
	 * requires write privileges).  The MP_TEMP test isn't interesting
	 * because we want to write to the backing file regardless so that
	 * we get a file descriptor to return.
	 */
	*fhp = &dbmfp->fh;
	return (F_ISSET(&dbmfp->fh, DB_FH_VALID) ? 0 : CDB___memp_fsync(dbmfp));
}

/*
 * CDB___memp_fsync --
 *	Mpool file internal sync function.
 */
static int
CDB___memp_fsync(dbmfp)
	DB_MPOOLFILE *dbmfp;
{
	BH *bhp, **bharray;
	DB_ENV *dbenv;
	DB_MPOOL *dbmp;
	MCACHE *mc;
	MPOOL *mp;
	size_t mf_offset;
	u_int32_t ar_cnt, i, ndirty;
	int incomplete, ret, retry_done, retry_need, wrote;

	dbmp = dbmfp->dbmp;
	dbenv = dbmp->dbenv;
	mp = dbmp->reginfo.primary;

	R_LOCK(dbenv, &dbmp->reginfo);

	/*
	 * Allocate room for a list of buffers, and decide how many buffers
	 * we can pin down.
	 *
	 * !!!
	 * Note: CDB___memp_sballoc has released our region lock if we're not
	 * continuing forward.
	 */
	if ((ret =
	    CDB___memp_sballoc(dbenv, &bharray, &ndirty)) != 0 || ndirty == 0)
		return (ret);

	retry_done = 0;
retry:	retry_need = 0;
	/*
	 * Walk each cache's list of buffers and mark all dirty buffers to be
	 * written and all pinned buffers to be potentially written (we can't
	 * know if they'll need to be written until the holder returns them to
	 * the cache).  We do this in one pass while holding the region locked
	 * so that processes can't make new buffers dirty, causing us to never
	 * finish.
	 */
	mf_offset = R_OFFSET(&dbmp->reginfo, dbmfp->mfp);
	for (ar_cnt = 0, incomplete = 0, i = 0; i < mp->nc_reg; ++i) {
		mc = dbmp->c_reginfo[i].primary;

		for (bhp = SH_TAILQ_FIRST(&mc->bhq, __bh);
		    bhp != NULL; bhp = SH_TAILQ_NEXT(bhp, q, __bh)) {
			/* Only this file's dirty buffers are interesting. */
			if (!F_ISSET(bhp, BH_DIRTY) ||
			    bhp->mf_offset != mf_offset)
				continue;
			if (bhp->ref != 0 || F_ISSET(bhp, BH_LOCKED)) {
				incomplete = 1;
				continue;
			}

			/*
			 * If the buffer isn't being used, we can write
			 * it immediately, so increment its reference
			 * count to lock it down, and save a reference
			 * to it.
			 *
			 * If we've run out of space to store buffer refs,
			 * we're screwed.  We don't want to realloc the
			 * array while holding a region lock, so we set
			 * a flag and deal with it later.
			 */
			++bhp->ref;
			bharray[ar_cnt] = bhp;
			if (++ar_cnt >= ndirty) {
				retry_need = 1;
				break;
			}
		}
		if (ar_cnt >= ndirty)
			break;
	}

	/* If there are no buffers we can write immediately, we're done. */
	if (ar_cnt == 0) {
		ret = 0;
		goto done;
	}

	R_UNLOCK(dbenv, &dbmp->reginfo);

	/* Sort the buffers we're going to write. */
	if (ar_cnt > 1)
		qsort(bharray, ar_cnt, sizeof(BH *), CDB___bhcmp);

	R_LOCK(dbenv, &dbmp->reginfo);

	/* Walk the array, writing buffers. */
	for (i = 0; i < ar_cnt;) {
		/*
		 * It's possible for a thread to have gotten the buffer since
		 * we listed it for writing.  If the reference count is still
		 * 1, we're the only ones using the buffer, go ahead and write.
		 * If it's >1, then skip the buffer and assume that it will be
		 * written when it's returned to the cache.
		 */
		if (bharray[i]->ref > 1) {
			incomplete = 1;
			--bharray[i++]->ref;
			continue;
		}

		/* Write the buffer. */
		ret = CDB___memp_pgwrite(dbmp, dbmfp, bharray[i], NULL, &wrote);

		/* Release the buffer. */
		--bharray[i++]->ref;

		if (ret == 0) {
			if (!wrote)
				incomplete = 1;
			continue;
		}

		/*
		 * On error:
		 *
		 * Release any buffers we're still pinning down.
		 */
		while (i < ar_cnt)
			--bharray[i++]->ref;
		break;
	}

	/*
	 * If there were too many buffers and we're not returning an error, we
	 * re-try the flush once -- since we allocated 80% of the total
	 * buffer count, once should be enough.  If it still doesn't work, some
	 * other thread of control is dirtying buffers as fast as we're writing
	 * them, and we might as well give up.
	 */
	if (retry_need) {
		if (retry_done)
			incomplete = 1;
		else {
			retry_done = 1;
			goto retry;
		}
	}

done:	R_UNLOCK(dbenv, &dbmp->reginfo);

	CDB___os_free(bharray, ndirty * sizeof(BH *));

	/*
	 * Sync the underlying file as the last thing we do, so that the OS
	 * has a maximal opportunity to flush buffers before we request it.
	 *
	 * !!!:
	 * Don't lock the region around the sync, fsync(2) has no atomicity
	 * issues.
	 */
	if (ret == 0)
		ret = incomplete ? DB_INCOMPLETE : CDB___os_fsync(&dbmfp->fh);

	return (ret);
}

/*
 * CDB___memp_sballoc --
 *	Allocate room for a list of buffers.
 *
 * On return with ndirty != 0, the region lock is held; on error or when
 * there is nothing dirty, the region lock has been released.
 */
static int
CDB___memp_sballoc(dbenv, bharrayp, ndirtyp)
	DB_ENV *dbenv;
	BH ***bharrayp;
	u_int32_t *ndirtyp;
{
	DB_MPOOL *dbmp;
	MCACHE *mc;
	MPOOL *mp;
	u_int32_t i, nclean, ndirty, maxpin;
	int ret;

	dbmp = dbenv->mp_handle;
	mp = dbmp->reginfo.primary;

	/*
	 * We don't want to hold the region lock while we write the buffers,
	 * so only lock it while we create a list.
	 *
	 * Walk through the list of caches, figuring out how many buffers
	 * we're going to need.
	 *
	 * Make a point of not holding the region lock across the library
	 * allocation call.
	 */
	for (nclean = ndirty = 0, i = 0; i < mp->nc_reg; ++i) {
		mc = dbmp->c_reginfo[i].primary;
		ndirty += mc->stat.st_page_dirty;
		nclean += mc->stat.st_page_clean;
	}
	R_UNLOCK(dbenv, &dbmp->reginfo);
	if (ndirty == 0) {
		*ndirtyp = 0;
		return (0);
	}

	/*
	 * We don't want to pin down the entire buffer cache, otherwise we'll
	 * starve threads needing new pages.  Don't pin down more than 80% of
	 * the cache, making sure that we don't screw up just because only a
	 * few pages have been created.
	 */
	maxpin = ((ndirty + nclean) * 8) / 10;
	if (maxpin < 10)
		maxpin = 10;

	/*
	 * Get a good-sized block of memory to hold buffer pointers, we don't
	 * want to run out, but correct if we want to allocate more than we
	 * would be allowed to store, regardless.
	 */
	ndirty += ndirty / 2 + 10;
	if (ndirty > maxpin)
		ndirty = maxpin;
	if ((ret = CDB___os_malloc(ndirty * sizeof(BH *), NULL, bharrayp)) != 0)
		return (ret);

	*ndirtyp = ndirty;

	/* Reacquire the region lock for the caller's buffer walk. */
	R_LOCK(dbenv, &dbmp->reginfo);

	return (0);
}

/*
 * CDB___bhcmp --
 *	qsort comparison: order buffer headers by file, then by page number.
 */
static int
CDB___bhcmp(p1, p2)
	const void *p1, *p2;
{
	BH *bhp1, *bhp2;

	bhp1 = *(BH * const *)p1;
	bhp2 = *(BH * const *)p2;

	/* Sort by file (shared memory pool offset). */
	if (bhp1->mf_offset < bhp2->mf_offset)
		return (-1);
	if (bhp1->mf_offset > bhp2->mf_offset)
		return (1);

	/*
	 * !!!
	 * Defend against badly written quicksort code calling the comparison
	 * function with two identical pointers (e.g., WATCOM C++ (Power++)).
	 */
	if (bhp1->pgno < bhp2->pgno)
		return (-1);
	if (bhp1->pgno > bhp2->pgno)
		return (1);
	return (0);
}