From: Tom Lane Date: Mon, 10 Mar 2008 20:06:27 +0000 (+0000) Subject: Provide a build-time option to store large relations as single files, rather X-Git-Tag: REL8_4_BETA1~1849 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=f0828b2fc3d021ef8d64337a3593eb44bd3b6114;p=postgresql Provide a build-time option to store large relations as single files, rather than dividing them into 1GB segments as has been our longtime practice. This requires working support for large files in the operating system; at least for the time being, it won't be the default. Zdenek Kotala --- diff --git a/configure b/configure index 99b0722ca1..476ce76c8a 100755 --- a/configure +++ b/configure @@ -1357,6 +1357,7 @@ Optional Features: --enable-debug build with debugging symbols (-g) --enable-profiling build with profiling enabled --enable-dtrace build with DTrace support + --disable-segmented-files disable data file segmentation (requires largefile support) --enable-depend turn on automatic dependency tracking --enable-cassert enable assertion checks (for debugging) --enable-thread-safety make client libraries thread-safe @@ -2541,6 +2542,36 @@ fi +# +# Data file segmentation +# + +pgac_args="$pgac_args enable_segmented_files" + +# Check whether --enable-segmented-files was given. +if test "${enable_segmented_files+set}" = set; then + enableval=$enable_segmented_files; + case $enableval in + yes) + : + ;; + no) + : + ;; + *) + { { echo "$as_me:$LINENO: error: no argument expected for --enable-segmented-files option" >&5 +echo "$as_me: error: no argument expected for --enable-segmented-files option" >&2;} + { (exit 1); exit 1; }; } + ;; + esac + +else + enable_segmented_files=yes + +fi + + + # # C compiler # @@ -23642,6 +23673,421 @@ fi fi +# Check for largefile support (must be after AC_SYS_LARGEFILE) +{ echo "$as_me:$LINENO: checking for off_t" >&5 +echo $ECHO_N "checking for off_t... 
$ECHO_C" >&6; } +if test "${ac_cv_type_off_t+set}" = set; then + echo $ECHO_N "(cached) $ECHO_C" >&6 +else + cat >conftest.$ac_ext <<_ACEOF +/* confdefs.h. */ +_ACEOF +cat confdefs.h >>conftest.$ac_ext +cat >>conftest.$ac_ext <<_ACEOF +/* end confdefs.h. */ +$ac_includes_default +typedef off_t ac__type_new_; +int +main () +{ +if ((ac__type_new_ *) 0) + return 0; +if (sizeof (ac__type_new_)) + return 0; + ; + return 0; +} +_ACEOF +rm -f conftest.$ac_objext +if { (ac_try="$ac_compile" +case "(($ac_try" in + *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; + *) ac_try_echo=$ac_try;; +esac +eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5 + (eval "$ac_compile") 2>conftest.er1 + ac_status=$? + grep -v '^ *+' conftest.er1 >conftest.err + rm -f conftest.er1 + cat conftest.err >&5 + echo "$as_me:$LINENO: \$? = $ac_status" >&5 + (exit $ac_status); } && { + test -z "$ac_c_werror_flag" || + test ! -s conftest.err + } && test -s conftest.$ac_objext; then + ac_cv_type_off_t=yes +else + echo "$as_me: failed program was:" >&5 +sed 's/^/| /' conftest.$ac_ext >&5 + + ac_cv_type_off_t=no +fi + +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext +fi +{ echo "$as_me:$LINENO: result: $ac_cv_type_off_t" >&5 +echo "${ECHO_T}$ac_cv_type_off_t" >&6; } + +# The cast to long int works around a bug in the HP C Compiler +# version HP92453-01 B.11.11.23709.GP, which incorrectly rejects +# declarations like `int a3[[(sizeof (unsigned char)) >= 0]];'. +# This bug is HP SR number 8606223364. +{ echo "$as_me:$LINENO: checking size of off_t" >&5 +echo $ECHO_N "checking size of off_t... $ECHO_C" >&6; } +if test "${ac_cv_sizeof_off_t+set}" = set; then + echo $ECHO_N "(cached) $ECHO_C" >&6 +else + if test "$cross_compiling" = yes; then + # Depending upon the size, compute the lo and hi bounds. +cat >conftest.$ac_ext <<_ACEOF +/* confdefs.h. */ +_ACEOF +cat confdefs.h >>conftest.$ac_ext +cat >>conftest.$ac_ext <<_ACEOF +/* end confdefs.h. 
*/ +$ac_includes_default + typedef off_t ac__type_sizeof_; +int +main () +{ +static int test_array [1 - 2 * !(((long int) (sizeof (ac__type_sizeof_))) >= 0)]; +test_array [0] = 0 + + ; + return 0; +} +_ACEOF +rm -f conftest.$ac_objext +if { (ac_try="$ac_compile" +case "(($ac_try" in + *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; + *) ac_try_echo=$ac_try;; +esac +eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5 + (eval "$ac_compile") 2>conftest.er1 + ac_status=$? + grep -v '^ *+' conftest.er1 >conftest.err + rm -f conftest.er1 + cat conftest.err >&5 + echo "$as_me:$LINENO: \$? = $ac_status" >&5 + (exit $ac_status); } && { + test -z "$ac_c_werror_flag" || + test ! -s conftest.err + } && test -s conftest.$ac_objext; then + ac_lo=0 ac_mid=0 + while :; do + cat >conftest.$ac_ext <<_ACEOF +/* confdefs.h. */ +_ACEOF +cat confdefs.h >>conftest.$ac_ext +cat >>conftest.$ac_ext <<_ACEOF +/* end confdefs.h. */ +$ac_includes_default + typedef off_t ac__type_sizeof_; +int +main () +{ +static int test_array [1 - 2 * !(((long int) (sizeof (ac__type_sizeof_))) <= $ac_mid)]; +test_array [0] = 0 + + ; + return 0; +} +_ACEOF +rm -f conftest.$ac_objext +if { (ac_try="$ac_compile" +case "(($ac_try" in + *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; + *) ac_try_echo=$ac_try;; +esac +eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5 + (eval "$ac_compile") 2>conftest.er1 + ac_status=$? + grep -v '^ *+' conftest.er1 >conftest.err + rm -f conftest.er1 + cat conftest.err >&5 + echo "$as_me:$LINENO: \$? = $ac_status" >&5 + (exit $ac_status); } && { + test -z "$ac_c_werror_flag" || + test ! 
-s conftest.err + } && test -s conftest.$ac_objext; then + ac_hi=$ac_mid; break +else + echo "$as_me: failed program was:" >&5 +sed 's/^/| /' conftest.$ac_ext >&5 + + ac_lo=`expr $ac_mid + 1` + if test $ac_lo -le $ac_mid; then + ac_lo= ac_hi= + break + fi + ac_mid=`expr 2 '*' $ac_mid + 1` +fi + +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext + done +else + echo "$as_me: failed program was:" >&5 +sed 's/^/| /' conftest.$ac_ext >&5 + + cat >conftest.$ac_ext <<_ACEOF +/* confdefs.h. */ +_ACEOF +cat confdefs.h >>conftest.$ac_ext +cat >>conftest.$ac_ext <<_ACEOF +/* end confdefs.h. */ +$ac_includes_default + typedef off_t ac__type_sizeof_; +int +main () +{ +static int test_array [1 - 2 * !(((long int) (sizeof (ac__type_sizeof_))) < 0)]; +test_array [0] = 0 + + ; + return 0; +} +_ACEOF +rm -f conftest.$ac_objext +if { (ac_try="$ac_compile" +case "(($ac_try" in + *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; + *) ac_try_echo=$ac_try;; +esac +eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5 + (eval "$ac_compile") 2>conftest.er1 + ac_status=$? + grep -v '^ *+' conftest.er1 >conftest.err + rm -f conftest.er1 + cat conftest.err >&5 + echo "$as_me:$LINENO: \$? = $ac_status" >&5 + (exit $ac_status); } && { + test -z "$ac_c_werror_flag" || + test ! -s conftest.err + } && test -s conftest.$ac_objext; then + ac_hi=-1 ac_mid=-1 + while :; do + cat >conftest.$ac_ext <<_ACEOF +/* confdefs.h. */ +_ACEOF +cat confdefs.h >>conftest.$ac_ext +cat >>conftest.$ac_ext <<_ACEOF +/* end confdefs.h. */ +$ac_includes_default + typedef off_t ac__type_sizeof_; +int +main () +{ +static int test_array [1 - 2 * !(((long int) (sizeof (ac__type_sizeof_))) >= $ac_mid)]; +test_array [0] = 0 + + ; + return 0; +} +_ACEOF +rm -f conftest.$ac_objext +if { (ac_try="$ac_compile" +case "(($ac_try" in + *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; + *) ac_try_echo=$ac_try;; +esac +eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5 + (eval "$ac_compile") 2>conftest.er1 + ac_status=$? 
+ grep -v '^ *+' conftest.er1 >conftest.err + rm -f conftest.er1 + cat conftest.err >&5 + echo "$as_me:$LINENO: \$? = $ac_status" >&5 + (exit $ac_status); } && { + test -z "$ac_c_werror_flag" || + test ! -s conftest.err + } && test -s conftest.$ac_objext; then + ac_lo=$ac_mid; break +else + echo "$as_me: failed program was:" >&5 +sed 's/^/| /' conftest.$ac_ext >&5 + + ac_hi=`expr '(' $ac_mid ')' - 1` + if test $ac_mid -le $ac_hi; then + ac_lo= ac_hi= + break + fi + ac_mid=`expr 2 '*' $ac_mid` +fi + +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext + done +else + echo "$as_me: failed program was:" >&5 +sed 's/^/| /' conftest.$ac_ext >&5 + + ac_lo= ac_hi= +fi + +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext +fi + +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext +# Binary search between lo and hi bounds. +while test "x$ac_lo" != "x$ac_hi"; do + ac_mid=`expr '(' $ac_hi - $ac_lo ')' / 2 + $ac_lo` + cat >conftest.$ac_ext <<_ACEOF +/* confdefs.h. */ +_ACEOF +cat confdefs.h >>conftest.$ac_ext +cat >>conftest.$ac_ext <<_ACEOF +/* end confdefs.h. */ +$ac_includes_default + typedef off_t ac__type_sizeof_; +int +main () +{ +static int test_array [1 - 2 * !(((long int) (sizeof (ac__type_sizeof_))) <= $ac_mid)]; +test_array [0] = 0 + + ; + return 0; +} +_ACEOF +rm -f conftest.$ac_objext +if { (ac_try="$ac_compile" +case "(($ac_try" in + *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; + *) ac_try_echo=$ac_try;; +esac +eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5 + (eval "$ac_compile") 2>conftest.er1 + ac_status=$? + grep -v '^ *+' conftest.er1 >conftest.err + rm -f conftest.er1 + cat conftest.err >&5 + echo "$as_me:$LINENO: \$? = $ac_status" >&5 + (exit $ac_status); } && { + test -z "$ac_c_werror_flag" || + test ! 
-s conftest.err + } && test -s conftest.$ac_objext; then + ac_hi=$ac_mid +else + echo "$as_me: failed program was:" >&5 +sed 's/^/| /' conftest.$ac_ext >&5 + + ac_lo=`expr '(' $ac_mid ')' + 1` +fi + +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext +done +case $ac_lo in +?*) ac_cv_sizeof_off_t=$ac_lo;; +'') if test "$ac_cv_type_off_t" = yes; then + { { echo "$as_me:$LINENO: error: cannot compute sizeof (off_t) +See \`config.log' for more details." >&5 +echo "$as_me: error: cannot compute sizeof (off_t) +See \`config.log' for more details." >&2;} + { (exit 77); exit 77; }; } + else + ac_cv_sizeof_off_t=0 + fi ;; +esac +else + cat >conftest.$ac_ext <<_ACEOF +/* confdefs.h. */ +_ACEOF +cat confdefs.h >>conftest.$ac_ext +cat >>conftest.$ac_ext <<_ACEOF +/* end confdefs.h. */ +$ac_includes_default + typedef off_t ac__type_sizeof_; +static long int longval () { return (long int) (sizeof (ac__type_sizeof_)); } +static unsigned long int ulongval () { return (long int) (sizeof (ac__type_sizeof_)); } +#include <stdio.h> +#include <stdlib.h> +int +main () +{ + + FILE *f = fopen ("conftest.val", "w"); + if (! f) + return 1; + if (((long int) (sizeof (ac__type_sizeof_))) < 0) + { + long int i = longval (); + if (i != ((long int) (sizeof (ac__type_sizeof_)))) + return 1; + fprintf (f, "%ld\n", i); + } + else + { + unsigned long int i = ulongval (); + if (i != ((long int) (sizeof (ac__type_sizeof_)))) + return 1; + fprintf (f, "%lu\n", i); + } + return ferror (f) || fclose (f) != 0; + + ; + return 0; +} +_ACEOF +rm -f conftest$ac_exeext +if { (ac_try="$ac_link" +case "(($ac_try" in + *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; + *) ac_try_echo=$ac_try;; +esac +eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5 + (eval "$ac_link") 2>&5 + ac_status=$? + echo "$as_me:$LINENO: \$? 
= $ac_status" >&5 + (exit $ac_status); } && { ac_try='./conftest$ac_exeext' + { (case "(($ac_try" in + *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; + *) ac_try_echo=$ac_try;; +esac +eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5 + (eval "$ac_try") 2>&5 + ac_status=$? + echo "$as_me:$LINENO: \$? = $ac_status" >&5 + (exit $ac_status); }; }; then + ac_cv_sizeof_off_t=`cat conftest.val` +else + echo "$as_me: program exited with status $ac_status" >&5 +echo "$as_me: failed program was:" >&5 +sed 's/^/| /' conftest.$ac_ext >&5 + +( exit $ac_status ) +if test "$ac_cv_type_off_t" = yes; then + { { echo "$as_me:$LINENO: error: cannot compute sizeof (off_t) +See \`config.log' for more details." >&5 +echo "$as_me: error: cannot compute sizeof (off_t) +See \`config.log' for more details." >&2;} + { (exit 77); exit 77; }; } + else + ac_cv_sizeof_off_t=0 + fi +fi +rm -f core *.core core.conftest.* gmon.out bb.out conftest$ac_exeext conftest.$ac_objext conftest.$ac_ext +fi +rm -f conftest.val +fi +{ echo "$as_me:$LINENO: result: $ac_cv_sizeof_off_t" >&5 +echo "${ECHO_T}$ac_cv_sizeof_off_t" >&6; } + + + +cat >>confdefs.h <<_ACEOF +#define SIZEOF_OFF_T $ac_cv_sizeof_off_t +_ACEOF + + + +if test "$ac_cv_sizeof_off_t" -lt 8 -o "$enable_segmented_files" = "yes"; then + +cat >>confdefs.h <<\_ACEOF +#define USE_SEGMENTED_FILES 1 +_ACEOF + +fi + # SunOS doesn't handle negative byte comparisons properly with +/- return { echo "$as_me:$LINENO: checking for working memcmp" >&5 echo $ECHO_N "checking for working memcmp... $ECHO_C" >&6; } diff --git a/configure.in b/configure.in index 2bdc371984..020009785c 100644 --- a/configure.in +++ b/configure.in @@ -1,5 +1,5 @@ dnl Process this file with autoconf to produce a configure script. 
-dnl $PostgreSQL: pgsql/configure.in,v 1.552 2008/02/24 05:21:54 tgl Exp $ +dnl $PostgreSQL: pgsql/configure.in,v 1.553 2008/03/10 20:06:27 tgl Exp $ dnl dnl Developers, please strive to achieve this order: dnl @@ -217,6 +217,12 @@ fi AC_SUBST(DTRACEFLAGS)]) AC_SUBST(enable_dtrace) +# +# Data file segmentation +# +PGAC_ARG_BOOL(enable, segmented-files, yes, + [ --disable-segmented-files disable data file segmentation (requires largefile support)]) + # # C compiler # @@ -1411,6 +1417,13 @@ if test $ac_cv_func_fseeko = yes; then AC_SYS_LARGEFILE fi +# Check for largefile support (must be after AC_SYS_LARGEFILE) +AC_CHECK_SIZEOF([off_t]) + +if test "$ac_cv_sizeof_off_t" -lt 8 -o "$enable_segmented_files" = "yes"; then + AC_DEFINE([USE_SEGMENTED_FILES], 1, [Define to split data files into 1GB segments.]) +fi + # SunOS doesn't handle negative byte comparisons properly with +/- return AC_FUNC_MEMCMP diff --git a/doc/src/sgml/installation.sgml b/doc/src/sgml/installation.sgml index a999002346..95a3f10be6 100644 --- a/doc/src/sgml/installation.sgml +++ b/doc/src/sgml/installation.sgml @@ -1,4 +1,4 @@ - + <![%standalone-include[<productname>PostgreSQL</>]]> @@ -1025,6 +1025,20 @@ su - postgres </listitem> </varlistentry> + <varlistentry> + <term><option>--disable-segmented-files</option></term> + <listitem> + <para> + Store large tables as single operating-system files, rather than + dividing them into 1GB segments as is the default. This option + is ignored unless the operating system has <quote>largefile</> + support (which most do, nowadays). It can be helpful to reduce + the number of file descriptors consumed when working with very + large tables. 
+ </para> + </listitem> + </varlistentry> + <varlistentry> <term><option>--disable-spinlocks</option></term> <listitem> diff --git a/doc/src/sgml/storage.sgml b/doc/src/sgml/storage.sgml index fe9ae611bf..7ba0c1e343 100644 --- a/doc/src/sgml/storage.sgml +++ b/doc/src/sgml/storage.sgml @@ -1,4 +1,4 @@ -<!-- $PostgreSQL: pgsql/doc/src/sgml/storage.sgml,v 1.21 2007/11/23 00:24:12 ishii Exp $ --> +<!-- $PostgreSQL: pgsql/doc/src/sgml/storage.sgml,v 1.22 2008/03/10 20:06:27 tgl Exp $ --> <chapter id="storage"> @@ -138,10 +138,14 @@ Avoid assuming that filenode and table OID are the same. </caution> <para> -When a table or index exceeds 1 GB, it is divided into gigabyte-sized +When a table or index exceeds 1 GB, it is normally divided into gigabyte-sized <firstterm>segments</>. The first segment's file name is the same as the filenode; subsequent segments are named filenode.1, filenode.2, etc. This arrangement avoids problems on platforms that have file size limitations. +(But if the platform does not have such a limitation, and +<option>--disable-segmented-files</option> was specified when +<productname>PostgreSQL</> was built, then each table or index is stored +as a single file, without segmentation.) The contents of tables and indexes are discussed further in <xref linkend="storage-page-layout">. </para> diff --git a/src/backend/storage/file/buffile.c b/src/backend/storage/file/buffile.c index 8d79e9574b..94e5c67911 100644 --- a/src/backend/storage/file/buffile.c +++ b/src/backend/storage/file/buffile.c @@ -7,7 +7,7 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/storage/file/buffile.c,v 1.29 2008/01/01 19:45:51 momjian Exp $ + * $PostgreSQL: pgsql/src/backend/storage/file/buffile.c,v 1.30 2008/03/10 20:06:27 tgl Exp $ * * NOTES: * @@ -38,13 +38,12 @@ #include "storage/buffile.h" /* - * The maximum safe file size is presumed to be RELSEG_SIZE * BLCKSZ. 
- * Note we adhere to this limit whether or not LET_OS_MANAGE_FILESIZE - * is defined, although md.c ignores it when that symbol is defined. - * The reason for doing this is that we'd like large temporary BufFiles - * to be spread across multiple tablespaces when available. + * We break BufFiles into gigabyte-sized segments, whether or not + * USE_SEGMENTED_FILES is defined. The reason is that we'd like large + * temporary BufFiles to be spread across multiple tablespaces when available. */ -#define MAX_PHYSICAL_FILESIZE (RELSEG_SIZE * BLCKSZ) +#define MAX_PHYSICAL_FILESIZE 0x40000000 +#define BUFFILE_SEG_SIZE (MAX_PHYSICAL_FILESIZE / BLCKSZ) /* * This data structure represents a buffered file that consists of one or @@ -56,7 +55,7 @@ struct BufFile int numFiles; /* number of physical files in set */ /* all files except the last have length exactly MAX_PHYSICAL_FILESIZE */ File *files; /* palloc'd array with numFiles entries */ - long *offsets; /* palloc'd array with numFiles entries */ + off_t *offsets; /* palloc'd array with numFiles entries */ /* * offsets[i] is the current seek position of files[i]. We use this to @@ -72,7 +71,7 @@ struct BufFile * Position as seen by user of BufFile is (curFile, curOffset + pos). 
*/ int curFile; /* file index (0..n) part of current pos */ - int curOffset; /* offset part of current pos */ + off_t curOffset; /* offset part of current pos */ int pos; /* next read/write position in buffer */ int nbytes; /* total # of valid bytes in buffer */ char buffer[BLCKSZ]; @@ -97,7 +96,7 @@ makeBufFile(File firstfile) file->numFiles = 1; file->files = (File *) palloc(sizeof(File)); file->files[0] = firstfile; - file->offsets = (long *) palloc(sizeof(long)); + file->offsets = (off_t *) palloc(sizeof(off_t)); file->offsets[0] = 0L; file->isTemp = false; file->isInterXact = false; @@ -124,8 +123,8 @@ extendBufFile(BufFile *file) file->files = (File *) repalloc(file->files, (file->numFiles + 1) * sizeof(File)); - file->offsets = (long *) repalloc(file->offsets, - (file->numFiles + 1) * sizeof(long)); + file->offsets = (off_t *) repalloc(file->offsets, + (file->numFiles + 1) * sizeof(off_t)); file->files[file->numFiles] = pfile; file->offsets[file->numFiles] = 0L; file->numFiles++; @@ -279,9 +278,9 @@ BufFileDumpBuffer(BufFile *file) bytestowrite = file->nbytes - wpos; if (file->isTemp) { - long availbytes = MAX_PHYSICAL_FILESIZE - file->curOffset; + off_t availbytes = MAX_PHYSICAL_FILESIZE - file->curOffset; - if ((long) bytestowrite > availbytes) + if ((off_t) bytestowrite > availbytes) bytestowrite = (int) availbytes; } @@ -451,10 +450,10 @@ BufFileFlush(BufFile *file) * impossible seek is attempted. */ int -BufFileSeek(BufFile *file, int fileno, long offset, int whence) +BufFileSeek(BufFile *file, int fileno, off_t offset, int whence) { int newFile; - long newOffset; + off_t newOffset; switch (whence) { @@ -469,7 +468,7 @@ BufFileSeek(BufFile *file, int fileno, long offset, int whence) /* * Relative seek considers only the signed offset, ignoring * fileno. Note that large offsets (> 1 gig) risk overflow in this - * add... + * add, unless we have 64-bit off_t. 
*/ newFile = file->curFile; newOffset = (file->curOffset + file->pos) + offset; @@ -537,7 +536,7 @@ BufFileSeek(BufFile *file, int fileno, long offset, int whence) } void -BufFileTell(BufFile *file, int *fileno, long *offset) +BufFileTell(BufFile *file, int *fileno, off_t *offset) { *fileno = file->curFile; *offset = file->curOffset + file->pos; @@ -558,8 +557,8 @@ int BufFileSeekBlock(BufFile *file, long blknum) { return BufFileSeek(file, - (int) (blknum / RELSEG_SIZE), - (blknum % RELSEG_SIZE) * BLCKSZ, + (int) (blknum / BUFFILE_SEG_SIZE), + (off_t) (blknum % BUFFILE_SEG_SIZE) * BLCKSZ, SEEK_SET); } @@ -575,7 +574,7 @@ BufFileTellBlock(BufFile *file) long blknum; blknum = (file->curOffset + file->pos) / BLCKSZ; - blknum += file->curFile * RELSEG_SIZE; + blknum += file->curFile * BUFFILE_SEG_SIZE; return blknum; } diff --git a/src/backend/storage/file/fd.c b/src/backend/storage/file/fd.c index 2a0108fcee..edce52155f 100644 --- a/src/backend/storage/file/fd.c +++ b/src/backend/storage/file/fd.c @@ -7,7 +7,7 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/storage/file/fd.c,v 1.143 2008/01/01 19:45:51 momjian Exp $ + * $PostgreSQL: pgsql/src/backend/storage/file/fd.c,v 1.144 2008/03/10 20:06:27 tgl Exp $ * * NOTES: * @@ -115,7 +115,7 @@ static int max_safe_fds = 32; /* default if not changed */ #define FileIsNotOpen(file) (VfdCache[file].fd == VFD_CLOSED) -#define FileUnknownPos (-1L) +#define FileUnknownPos ((off_t) -1) /* these are the assigned bits in fdstate below: */ #define FD_TEMPORARY (1 << 0) /* T = delete when closed */ @@ -123,13 +123,13 @@ static int max_safe_fds = 32; /* default if not changed */ typedef struct vfd { - signed short fd; /* current FD, or VFD_CLOSED if none */ + int fd; /* current FD, or VFD_CLOSED if none */ unsigned short fdstate; /* bitflags for VFD's state */ - SubTransactionId create_subid; /* for TEMPORARY fds, creating subxact */ + SubTransactionId 
create_subid; /* for TEMPORARY fds, creating subxact */ File nextFree; /* link to next free VFD, if in freelist */ File lruMoreRecently; /* doubly linked recency-of-use list */ File lruLessRecently; - long seekPos; /* current logical file position */ + off_t seekPos; /* current logical file position */ char *fileName; /* name of file, or NULL for unused VFD */ /* NB: fileName is malloc'd, and must be free'd when closing the VFD */ int fileFlags; /* open(2) flags for (re)opening the file */ @@ -544,8 +544,8 @@ LruDelete(File file) Delete(file); /* save the seek position */ - vfdP->seekPos = (long) lseek(vfdP->fd, 0L, SEEK_CUR); - Assert(vfdP->seekPos != -1L); + vfdP->seekPos = lseek(vfdP->fd, (off_t) 0, SEEK_CUR); + Assert(vfdP->seekPos != (off_t) -1); /* close the file */ if (close(vfdP->fd)) @@ -616,12 +616,12 @@ LruInsert(File file) } /* seek to the right position */ - if (vfdP->seekPos != 0L) + if (vfdP->seekPos != (off_t) 0) { - long returnValue; + off_t returnValue; - returnValue = (long) lseek(vfdP->fd, vfdP->seekPos, SEEK_SET); - Assert(returnValue != -1L); + returnValue = lseek(vfdP->fd, vfdP->seekPos, SEEK_SET); + Assert(returnValue != (off_t) -1); } } @@ -1027,9 +1027,10 @@ FileRead(File file, char *buffer, int amount) Assert(FileIsValid(file)); - DO_DB(elog(LOG, "FileRead: %d (%s) %ld %d %p", + DO_DB(elog(LOG, "FileRead: %d (%s) " INT64_FORMAT " %d %p", file, VfdCache[file].fileName, - VfdCache[file].seekPos, amount, buffer)); + (int64) VfdCache[file].seekPos, + amount, buffer)); returnCode = FileAccess(file); if (returnCode < 0) @@ -1081,9 +1082,10 @@ FileWrite(File file, char *buffer, int amount) Assert(FileIsValid(file)); - DO_DB(elog(LOG, "FileWrite: %d (%s) %ld %d %p", + DO_DB(elog(LOG, "FileWrite: %d (%s) " INT64_FORMAT " %d %p", file, VfdCache[file].fileName, - VfdCache[file].seekPos, amount, buffer)); + (int64) VfdCache[file].seekPos, + amount, buffer)); returnCode = FileAccess(file); if (returnCode < 0) @@ -1146,16 +1148,17 @@ FileSync(File 
file) return pg_fsync(VfdCache[file].fd); } -long -FileSeek(File file, long offset, int whence) +off_t +FileSeek(File file, off_t offset, int whence) { int returnCode; Assert(FileIsValid(file)); - DO_DB(elog(LOG, "FileSeek: %d (%s) %ld %ld %d", + DO_DB(elog(LOG, "FileSeek: %d (%s) " INT64_FORMAT " " INT64_FORMAT " %d", file, VfdCache[file].fileName, - VfdCache[file].seekPos, offset, whence)); + (int64) VfdCache[file].seekPos, + (int64) offset, whence)); if (FileIsNotOpen(file)) { @@ -1163,7 +1166,8 @@ FileSeek(File file, long offset, int whence) { case SEEK_SET: if (offset < 0) - elog(ERROR, "invalid seek offset: %ld", offset); + elog(ERROR, "invalid seek offset: " INT64_FORMAT, + (int64) offset); VfdCache[file].seekPos = offset; break; case SEEK_CUR: @@ -1187,7 +1191,8 @@ FileSeek(File file, long offset, int whence) { case SEEK_SET: if (offset < 0) - elog(ERROR, "invalid seek offset: %ld", offset); + elog(ERROR, "invalid seek offset: " INT64_FORMAT, + (int64) offset); if (VfdCache[file].seekPos != offset) VfdCache[file].seekPos = lseek(VfdCache[file].fd, offset, whence); @@ -1213,7 +1218,7 @@ FileSeek(File file, long offset, int whence) * XXX not actually used but here for completeness */ #ifdef NOT_USED -long +off_t FileTell(File file) { Assert(FileIsValid(file)); @@ -1224,7 +1229,7 @@ FileTell(File file) #endif int -FileTruncate(File file, long offset) +FileTruncate(File file, off_t offset) { int returnCode; @@ -1237,7 +1242,7 @@ FileTruncate(File file, long offset) if (returnCode < 0) return returnCode; - returnCode = ftruncate(VfdCache[file].fd, (size_t) offset); + returnCode = ftruncate(VfdCache[file].fd, offset); return returnCode; } diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c index 543574be40..6ea4a00b01 100644 --- a/src/backend/storage/smgr/md.c +++ b/src/backend/storage/smgr/md.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/storage/smgr/md.c,v 1.135 2008/01/01 19:45:52 momjian Exp $ + * 
$PostgreSQL: pgsql/src/backend/storage/smgr/md.c,v 1.136 2008/03/10 20:06:27 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -89,16 +89,16 @@ * * All MdfdVec objects are palloc'd in the MdCxt memory context. * - * Defining LET_OS_MANAGE_FILESIZE disables the segmentation logic, - * for use on machines that support large files. Beware that that - * code has not been tested in a long time and is probably bit-rotted. + * On platforms that support large files, USE_SEGMENTED_FILES can be + * #undef'd to disable the segmentation logic. In that case each + * relation is a single operating-system file. */ typedef struct _MdfdVec { File mdfd_vfd; /* fd number in fd.c's pool */ BlockNumber mdfd_segno; /* segment number, from 0 */ -#ifndef LET_OS_MANAGE_FILESIZE /* for large relations */ +#ifdef USE_SEGMENTED_FILES struct _MdfdVec *mdfd_chain; /* next segment, or NULL */ #endif } MdfdVec; @@ -162,7 +162,7 @@ static void register_dirty_segment(SMgrRelation reln, MdfdVec *seg); static void register_unlink(RelFileNode rnode); static MdfdVec *_fdvec_alloc(void); -#ifndef LET_OS_MANAGE_FILESIZE +#ifdef USE_SEGMENTED_FILES static MdfdVec *_mdfd_openseg(SMgrRelation reln, BlockNumber segno, int oflags); #endif @@ -258,7 +258,7 @@ mdcreate(SMgrRelation reln, bool isRedo) reln->md_fd->mdfd_vfd = fd; reln->md_fd->mdfd_segno = 0; -#ifndef LET_OS_MANAGE_FILESIZE +#ifdef USE_SEGMENTED_FILES reln->md_fd->mdfd_chain = NULL; #endif } @@ -344,7 +344,7 @@ mdunlink(RelFileNode rnode, bool isRedo) rnode.relNode))); } -#ifndef LET_OS_MANAGE_FILESIZE +#ifdef USE_SEGMENTED_FILES /* Delete the additional segments, if any */ else { @@ -395,7 +395,7 @@ mdunlink(RelFileNode rnode, bool isRedo) void mdextend(SMgrRelation reln, BlockNumber blocknum, char *buffer, bool isTemp) { - long seekpos; + off_t seekpos; int nbytes; MdfdVec *v; @@ -420,11 +420,11 @@ mdextend(SMgrRelation reln, BlockNumber blocknum, char *buffer, bool isTemp) v = _mdfd_getseg(reln, 
blocknum, isTemp, EXTENSION_CREATE); -#ifndef LET_OS_MANAGE_FILESIZE - seekpos = (long) (BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE))); - Assert(seekpos < BLCKSZ * RELSEG_SIZE); +#ifdef USE_SEGMENTED_FILES + seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE)); + Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE); #else - seekpos = (long) (BLCKSZ * (blocknum)); + seekpos = (off_t) BLCKSZ * blocknum; #endif /* @@ -469,7 +469,7 @@ mdextend(SMgrRelation reln, BlockNumber blocknum, char *buffer, bool isTemp) if (!isTemp) register_dirty_segment(reln, v); -#ifndef LET_OS_MANAGE_FILESIZE +#ifdef USE_SEGMENTED_FILES Assert(_mdnblocks(reln, v) <= ((BlockNumber) RELSEG_SIZE)); #endif } @@ -530,7 +530,7 @@ mdopen(SMgrRelation reln, ExtensionBehavior behavior) mdfd->mdfd_vfd = fd; mdfd->mdfd_segno = 0; -#ifndef LET_OS_MANAGE_FILESIZE +#ifdef USE_SEGMENTED_FILES mdfd->mdfd_chain = NULL; Assert(_mdnblocks(reln, mdfd) <= ((BlockNumber) RELSEG_SIZE)); #endif @@ -552,7 +552,7 @@ mdclose(SMgrRelation reln) reln->md_fd = NULL; /* prevent dangling pointer after error */ -#ifndef LET_OS_MANAGE_FILESIZE +#ifdef USE_SEGMENTED_FILES while (v != NULL) { MdfdVec *ov = v; @@ -577,17 +577,17 @@ mdclose(SMgrRelation reln) void mdread(SMgrRelation reln, BlockNumber blocknum, char *buffer) { - long seekpos; + off_t seekpos; int nbytes; MdfdVec *v; v = _mdfd_getseg(reln, blocknum, false, EXTENSION_FAIL); -#ifndef LET_OS_MANAGE_FILESIZE - seekpos = (long) (BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE))); - Assert(seekpos < BLCKSZ * RELSEG_SIZE); +#ifdef USE_SEGMENTED_FILES + seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE)); + Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE); #else - seekpos = (long) (BLCKSZ * (blocknum)); + seekpos = (off_t) BLCKSZ * blocknum; #endif if (FileSeek(v->mdfd_vfd, seekpos, SEEK_SET) != seekpos) @@ -642,7 +642,7 @@ mdread(SMgrRelation reln, BlockNumber blocknum, char *buffer) void mdwrite(SMgrRelation reln, BlockNumber blocknum, char 
*buffer, bool isTemp) { - long seekpos; + off_t seekpos; int nbytes; MdfdVec *v; @@ -653,11 +653,11 @@ mdwrite(SMgrRelation reln, BlockNumber blocknum, char *buffer, bool isTemp) v = _mdfd_getseg(reln, blocknum, isTemp, EXTENSION_FAIL); -#ifndef LET_OS_MANAGE_FILESIZE - seekpos = (long) (BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE))); - Assert(seekpos < BLCKSZ * RELSEG_SIZE); +#ifdef USE_SEGMENTED_FILES + seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE)); + Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE); #else - seekpos = (long) (BLCKSZ * (blocknum)); + seekpos = (off_t) BLCKSZ * blocknum; #endif if (FileSeek(v->mdfd_vfd, seekpos, SEEK_SET) != seekpos) @@ -708,7 +708,7 @@ mdnblocks(SMgrRelation reln) { MdfdVec *v = mdopen(reln, EXTENSION_FAIL); -#ifndef LET_OS_MANAGE_FILESIZE +#ifdef USE_SEGMENTED_FILES BlockNumber nblocks; BlockNumber segno = 0; @@ -778,7 +778,7 @@ mdtruncate(SMgrRelation reln, BlockNumber nblocks, bool isTemp) MdfdVec *v; BlockNumber curnblk; -#ifndef LET_OS_MANAGE_FILESIZE +#ifdef USE_SEGMENTED_FILES BlockNumber priorblocks; #endif @@ -804,7 +804,7 @@ mdtruncate(SMgrRelation reln, BlockNumber nblocks, bool isTemp) v = mdopen(reln, EXTENSION_FAIL); -#ifndef LET_OS_MANAGE_FILESIZE +#ifdef USE_SEGMENTED_FILES priorblocks = 0; while (v != NULL) { @@ -843,7 +843,7 @@ mdtruncate(SMgrRelation reln, BlockNumber nblocks, bool isTemp) */ BlockNumber lastsegblocks = nblocks - priorblocks; - if (FileTruncate(v->mdfd_vfd, lastsegblocks * BLCKSZ) < 0) + if (FileTruncate(v->mdfd_vfd, (off_t) lastsegblocks * BLCKSZ) < 0) ereport(ERROR, (errcode_for_file_access(), errmsg("could not truncate relation %u/%u/%u to %u blocks: %m", @@ -867,7 +867,8 @@ mdtruncate(SMgrRelation reln, BlockNumber nblocks, bool isTemp) priorblocks += RELSEG_SIZE; } #else - if (FileTruncate(v->mdfd_vfd, nblocks * BLCKSZ) < 0) + /* For unsegmented files, it's a lot easier */ + if (FileTruncate(v->mdfd_vfd, (off_t) nblocks * BLCKSZ) < 0) ereport(ERROR, 
(errcode_for_file_access(), errmsg("could not truncate relation %u/%u/%u to %u blocks: %m", @@ -900,7 +901,7 @@ mdimmedsync(SMgrRelation reln) v = mdopen(reln, EXTENSION_FAIL); -#ifndef LET_OS_MANAGE_FILESIZE +#ifdef USE_SEGMENTED_FILES while (v != NULL) { if (FileSync(v->mdfd_vfd) < 0) @@ -917,8 +918,7 @@ mdimmedsync(SMgrRelation reln) if (FileSync(v->mdfd_vfd) < 0) ereport(ERROR, (errcode_for_file_access(), - errmsg("could not fsync segment %u of relation %u/%u/%u: %m", - v->mdfd_segno, + errmsg("could not fsync relation %u/%u/%u: %m", reln->smgr_rnode.spcNode, reln->smgr_rnode.dbNode, reln->smgr_rnode.relNode))); @@ -1453,7 +1453,7 @@ _fdvec_alloc(void) return (MdfdVec *) MemoryContextAlloc(MdCxt, sizeof(MdfdVec)); } -#ifndef LET_OS_MANAGE_FILESIZE +#ifdef USE_SEGMENTED_FILES /* * Open the specified segment of the relation, @@ -1499,7 +1499,7 @@ _mdfd_openseg(SMgrRelation reln, BlockNumber segno, int oflags) /* all done */ return v; } -#endif /* LET_OS_MANAGE_FILESIZE */ +#endif /* USE_SEGMENTED_FILES */ /* * _mdfd_getseg() -- Find the segment of the relation holding the @@ -1515,7 +1515,7 @@ _mdfd_getseg(SMgrRelation reln, BlockNumber blkno, bool isTemp, { MdfdVec *v = mdopen(reln, behavior); -#ifndef LET_OS_MANAGE_FILESIZE +#ifdef USE_SEGMENTED_FILES BlockNumber targetseg; BlockNumber nextsegno; @@ -1588,7 +1588,7 @@ _mdfd_getseg(SMgrRelation reln, BlockNumber blkno, bool isTemp, static BlockNumber _mdnblocks(SMgrRelation reln, MdfdVec *seg) { - long len; + off_t len; len = FileSeek(seg->mdfd_vfd, 0L, SEEK_END); if (len < 0) diff --git a/src/backend/utils/sort/tuplestore.c b/src/backend/utils/sort/tuplestore.c index e297579674..d6c192993e 100644 --- a/src/backend/utils/sort/tuplestore.c +++ b/src/backend/utils/sort/tuplestore.c @@ -38,7 +38,7 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/utils/sort/tuplestore.c,v 1.36 2008/01/01 19:45:55 momjian Exp $ + * $PostgreSQL: 
pgsql/src/backend/utils/sort/tuplestore.c,v 1.37 2008/03/10 20:06:27 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -134,14 +134,14 @@ struct Tuplestorestate bool eof_reached; /* read reached EOF (always valid) */ int current; /* next array index (valid if INMEM) */ int readpos_file; /* file# (valid if WRITEFILE and not eof) */ - long readpos_offset; /* offset (valid if WRITEFILE and not eof) */ + off_t readpos_offset; /* offset (valid if WRITEFILE and not eof) */ int writepos_file; /* file# (valid if READFILE) */ - long writepos_offset; /* offset (valid if READFILE) */ + off_t writepos_offset; /* offset (valid if READFILE) */ /* markpos_xxx holds marked position for mark and restore */ int markpos_current; /* saved "current" */ int markpos_file; /* saved "readpos_file" */ - long markpos_offset; /* saved "readpos_offset" */ + off_t markpos_offset; /* saved "readpos_offset" */ }; #define COPYTUP(state,tup) ((*(state)->copytup) (state, tup)) diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in index b48e261cbf..24b7c0dc86 100644 --- a/src/include/pg_config.h.in +++ b/src/include/pg_config.h.in @@ -637,6 +637,9 @@ your system. */ #undef PTHREAD_CREATE_JOINABLE +/* The size of `off_t', as computed by sizeof. */ +#undef SIZEOF_OFF_T + /* The size of `size_t', as computed by sizeof. */ #undef SIZEOF_SIZE_T @@ -685,6 +688,9 @@ /* Use replacement snprintf() functions. */ #undef USE_REPL_SNPRINTF +/* Define to split data files into 1GB segments. */ +#undef USE_SEGMENTED_FILES + /* Define to build with (Open)SSL support. (--with-openssl) */ #undef USE_SSL diff --git a/src/include/pg_config_manual.h b/src/include/pg_config_manual.h index 0a1e8233d3..c0d546761a 100644 --- a/src/include/pg_config_manual.h +++ b/src/include/pg_config_manual.h @@ -6,7 +6,7 @@ * for developers. If you edit any of these, be sure to do a *full* * rebuild (and an initdb if noted). 
* - * $PostgreSQL: pgsql/src/include/pg_config_manual.h,v 1.28 2008/02/29 20:58:33 alvherre Exp $ + * $PostgreSQL: pgsql/src/include/pg_config_manual.h,v 1.29 2008/03/10 20:06:27 tgl Exp $ *------------------------------------------------------------------------ */ @@ -27,8 +27,9 @@ /* * RELSEG_SIZE is the maximum number of blocks allowed in one disk - * file. Thus, the maximum size of a single file is RELSEG_SIZE * - * BLCKSZ; relations bigger than that are divided into multiple files. + * file when USE_SEGMENTED_FILES is defined. Thus, the maximum size + * of a single file is RELSEG_SIZE * BLCKSZ; relations bigger than that + * are divided into multiple files. * * RELSEG_SIZE * BLCKSZ must be less than your OS' limit on file size. * This is often 2 GB or 4GB in a 32-bit operating system, unless you @@ -39,9 +40,16 @@ * in the direction of a small limit. (Besides, a power-of-2 value * saves a few cycles in md.c.) * + * When not using segmented files, RELSEG_SIZE is set to zero so that + * this behavior can be distinguished in pg_control. + * * Changing RELSEG_SIZE requires an initdb. */ +#ifdef USE_SEGMENTED_FILES #define RELSEG_SIZE (0x40000000 / BLCKSZ) +#else +#define RELSEG_SIZE 0 +#endif /* * Size of a WAL file block. This need have no particular relation to BLCKSZ. 
diff --git a/src/include/storage/buffile.h b/src/include/storage/buffile.h index 3313e43ea0..e50ec2f834 100644 --- a/src/include/storage/buffile.h +++ b/src/include/storage/buffile.h @@ -18,7 +18,7 @@ * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/include/storage/buffile.h,v 1.23 2008/01/01 19:45:58 momjian Exp $ + * $PostgreSQL: pgsql/src/include/storage/buffile.h,v 1.24 2008/03/10 20:06:27 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -38,8 +38,8 @@ extern BufFile *BufFileCreateTemp(bool interXact); extern void BufFileClose(BufFile *file); extern size_t BufFileRead(BufFile *file, void *ptr, size_t size); extern size_t BufFileWrite(BufFile *file, void *ptr, size_t size); -extern int BufFileSeek(BufFile *file, int fileno, long offset, int whence); -extern void BufFileTell(BufFile *file, int *fileno, long *offset); +extern int BufFileSeek(BufFile *file, int fileno, off_t offset, int whence); +extern void BufFileTell(BufFile *file, int *fileno, off_t *offset); extern int BufFileSeekBlock(BufFile *file, long blknum); #endif /* BUFFILE_H */ diff --git a/src/include/storage/fd.h b/src/include/storage/fd.h index f5862bf82b..05c2a62525 100644 --- a/src/include/storage/fd.h +++ b/src/include/storage/fd.h @@ -7,7 +7,7 @@ * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/include/storage/fd.h,v 1.61 2008/01/01 19:45:58 momjian Exp $ + * $PostgreSQL: pgsql/src/include/storage/fd.h,v 1.62 2008/03/10 20:06:27 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -65,8 +65,8 @@ extern void FileClose(File file); extern int FileRead(File file, char *buffer, int amount); extern int FileWrite(File file, char *buffer, int amount); extern int FileSync(File 
file); -extern long FileSeek(File file, long offset, int whence); -extern int FileTruncate(File file, long offset); +extern off_t FileSeek(File file, off_t offset, int whence); +extern int FileTruncate(File file, off_t offset); /* Operations that allow use of regular stdio --- USE WITH CAUTION */ extern FILE *AllocateFile(const char *name, const char *mode);