From f089c36419adcfc23b3a83560dc67633b1e7b939 Mon Sep 17 00:00:00 2001 From: Bruce Momjian Date: Mon, 29 May 2000 17:44:17 +0000 Subject: [PATCH] Add analyze.c file for split. --- src/backend/commands/analyze.c | 635 +++++++++++++++++++++++++++++++++ 1 file changed, 635 insertions(+) create mode 100644 src/backend/commands/analyze.c diff --git a/src/backend/commands/analyze.c b/src/backend/commands/analyze.c new file mode 100644 index 0000000000..9428ba1ee5 --- /dev/null +++ b/src/backend/commands/analyze.c @@ -0,0 +1,635 @@ +/*------------------------------------------------------------------------- + * + * analyze.c + * the postgres optimizer analyzer + * + * Portions Copyright (c) 1996-2000, PostgreSQL, Inc + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * $Header: /cvsroot/pgsql/src/backend/commands/analyze.c,v 1.1 2000/05/29 17:44:17 momjian Exp $ + * + + *------------------------------------------------------------------------- + */ +#include +#include +#include +#include +#include + +#include "postgres.h" + +#include "access/heapam.h" +#include "catalog/catname.h" +#include "catalog/indexing.h" +#include "catalog/pg_operator.h" +#include "catalog/pg_statistic.h" +#include "catalog/pg_type.h" +#include "commands/vacuum.h" +#include "miscadmin.h" +#include "parser/parse_oper.h" +#include "tcop/tcopprot.h" +#include "utils/acl.h" +#include "utils/builtins.h" +#include "utils/fmgroids.h" +#include "utils/inval.h" +#include "utils/syscache.h" + +#define swapLong(a,b) {long tmp; tmp=a; a=b; b=tmp;} +#define swapInt(a,b) {int tmp; tmp=a; a=b; b=tmp;} +#define swapDatum(a,b) {Datum tmp; tmp=a; a=b; b=tmp;} +#define VacAttrStatsEqValid(stats) ( stats->f_cmpeq.fn_addr != NULL ) +#define VacAttrStatsLtGtValid(stats) ( stats->f_cmplt.fn_addr != NULL && \ + stats->f_cmpgt.fn_addr != NULL && \ + RegProcedureIsValid(stats->outfunc) ) + + +static void attr_stats(Relation onerel, int attr_cnt, VacAttrStats *vacattrstats, HeapTuple tuple); +static void bucketcpy(Form_pg_attribute attr, Datum value, Datum *bucket, int *bucket_len); +static void update_attstats(Oid relid, int natts, VacAttrStats *vacattrstats); +static void del_stats(Oid relid, int attcnt, int *attnums); + + +/* + * analyze_rel() -- analyze relation + */ +void +analyze_rel(Oid relid, List *anal_cols2, int MESSAGE_LEVEL) +{ + HeapTuple tuple, + typetuple; + Relation onerel; + int32 i; + int attr_cnt, + *attnums = NULL; + Form_pg_attribute *attr; + VacAttrStats *vacattrstats; + HeapScanDesc scan; + + StartTransactionCommand(); + + /* + * Check for user-requested abort. Note we want this to be inside a + * transaction, so xact.c doesn't issue useless NOTICE. + */ + if (QueryCancel) + CancelQuery(); + + /* + * Race condition -- if the pg_class tuple has gone away since the + * last time we saw it, we don't need to vacuum it. + */ + tuple = SearchSysCacheTuple(RELOID, + ObjectIdGetDatum(relid), + 0, 0, 0); + /* + * We can VACUUM ANALYZE any table except pg_statistic. + * see update_relstats + */ + if (!HeapTupleIsValid(tuple) || + strcmp(NameStr(((Form_pg_class) GETSTRUCT(tuple))->relname), + StatisticRelationName) == 0) + { + CommitTransactionCommand(); + return; + } + + onerel = heap_open(relid, AccessShareLock); + +#ifndef NO_SECURITY + if (!pg_ownercheck(GetPgUserName(), RelationGetRelationName(onerel), + RELNAME)) + { + /* we already did an elog during vacuum + elog(NOTICE, "Skipping \"%s\" --- only table owner can VACUUM it", + RelationGetRelationName(onerel)); + */ + heap_close(onerel, AccessExclusiveLock); + CommitTransactionCommand(); + return; + } +#endif + + elog(MESSAGE_LEVEL, "Analyzing..."); + + attr_cnt = onerel->rd_att->natts; + attr = onerel->rd_att->attrs; + + if (anal_cols2 != NIL) + { + int tcnt = 0; + List *le; + + if (length(anal_cols2) > attr_cnt) + elog(ERROR, "vacuum: too many attributes specified for relation %s", + RelationGetRelationName(onerel)); + attnums = (int *) palloc(attr_cnt * sizeof(int)); + foreach(le, anal_cols2) + { + char *col = (char *) lfirst(le); + + for (i = 0; i < attr_cnt; i++) + { + if (namestrcmp(&(attr[i]->attname), col) == 0) + break; + } + if (i < attr_cnt) /* found */ + attnums[tcnt++] = i; + else + { + elog(ERROR, "vacuum: there is no attribute %s in %s", + col, RelationGetRelationName(onerel)); + } + } + attr_cnt = tcnt; + } + + vacattrstats = (VacAttrStats *) palloc(attr_cnt * sizeof(VacAttrStats)); + + for (i = 0; i < attr_cnt; i++) + { + Operator func_operator; + Form_pg_operator pgopform; + VacAttrStats *stats; + + stats = &vacattrstats[i]; + stats->attr = palloc(ATTRIBUTE_TUPLE_SIZE); + memmove(stats->attr, attr[((attnums) ? attnums[i] : i)], ATTRIBUTE_TUPLE_SIZE); + stats->best = stats->guess1 = stats->guess2 = 0; + stats->max = stats->min = 0; + stats->best_len = stats->guess1_len = stats->guess2_len = 0; + stats->max_len = stats->min_len = 0; + stats->initialized = false; + stats->best_cnt = stats->guess1_cnt = stats->guess1_hits = stats->guess2_hits = 0; + stats->max_cnt = stats->min_cnt = stats->null_cnt = stats->nonnull_cnt = 0; + + func_operator = oper("=", stats->attr->atttypid, stats->attr->atttypid, true); + if (func_operator != NULL) + { + pgopform = (Form_pg_operator) GETSTRUCT(func_operator); + fmgr_info(pgopform->oprcode, &(stats->f_cmpeq)); + } + else + stats->f_cmpeq.fn_addr = NULL; + + func_operator = oper("<", stats->attr->atttypid, stats->attr->atttypid, true); + if (func_operator != NULL) + { + pgopform = (Form_pg_operator) GETSTRUCT(func_operator); + fmgr_info(pgopform->oprcode, &(stats->f_cmplt)); + stats->op_cmplt = oprid(func_operator); + } + else + { + stats->f_cmplt.fn_addr = NULL; + stats->op_cmplt = InvalidOid; + } + + func_operator = oper(">", stats->attr->atttypid, stats->attr->atttypid, true); + if (func_operator != NULL) + { + pgopform = (Form_pg_operator) GETSTRUCT(func_operator); + fmgr_info(pgopform->oprcode, &(stats->f_cmpgt)); + } + else + stats->f_cmpgt.fn_addr = NULL; + + typetuple = SearchSysCacheTuple(TYPEOID, + ObjectIdGetDatum(stats->attr->atttypid), + 0, 0, 0); + if (HeapTupleIsValid(typetuple)) + { + stats->outfunc = ((Form_pg_type) GETSTRUCT(typetuple))->typoutput; + stats->typelem = ((Form_pg_type) GETSTRUCT(typetuple))->typelem; + } + else + { + stats->outfunc = InvalidOid; + stats->typelem = InvalidOid; + } + } + /* delete existing pg_statistic rows for relation */ + del_stats(relid, ((attnums) ? attr_cnt : 0), attnums); + + scan = heap_beginscan(onerel, false, SnapshotNow, 0, NULL); + + while (HeapTupleIsValid(tuple = heap_getnext(scan, 0))) + attr_stats(onerel, attr_cnt, vacattrstats, tuple); + + heap_endscan(scan); + + heap_close(onerel, AccessShareLock); + + /* update statistics in pg_class */ + update_attstats(relid, attr_cnt, vacattrstats); + + CommitTransactionCommand(); +} + +/* + * attr_stats() -- compute column statistics used by the optimzer + * + * We compute the column min, max, null and non-null counts. + * Plus we attempt to find the count of the value that occurs most + * frequently in each column. These figures are used to compute + * the selectivity of the column. + * + * We use a three-bucked cache to get the most frequent item. + * The 'guess' buckets count hits. A cache miss causes guess1 + * to get the most hit 'guess' item in the most recent cycle, and + * the new item goes into guess2. Whenever the total count of hits + * of a 'guess' entry is larger than 'best', 'guess' becomes 'best'. + * + * This method works perfectly for columns with unique values, and columns + * with only two unique values, plus nulls. + * + * It becomes less perfect as the number of unique values increases and + * their distribution in the table becomes more random. + * + */ +static void +attr_stats(Relation onerel, int attr_cnt, VacAttrStats *vacattrstats, HeapTuple tuple) +{ + int i; + TupleDesc tupDesc = onerel->rd_att; + Datum value; + bool isnull; + + for (i = 0; i < attr_cnt; i++) + { + VacAttrStats *stats = &vacattrstats[i]; + bool value_hit = true; + +#ifdef _DROP_COLUMN_HACK__ + if (COLUMN_IS_DROPPED(stats->attr)) + continue; +#endif /* _DROP_COLUMN_HACK__ */ + value = heap_getattr(tuple, + stats->attr->attnum, tupDesc, &isnull); + + if (!VacAttrStatsEqValid(stats)) + continue; + + if (isnull) + stats->null_cnt++; + else + { + stats->nonnull_cnt++; + if (stats->initialized == false) + { + bucketcpy(stats->attr, value, &stats->best, &stats->best_len); + /* best_cnt gets incremented later */ + bucketcpy(stats->attr, value, &stats->guess1, &stats->guess1_len); + stats->guess1_cnt = stats->guess1_hits = 1; + bucketcpy(stats->attr, value, &stats->guess2, &stats->guess2_len); + stats->guess2_hits = 1; + if (VacAttrStatsLtGtValid(stats)) + { + bucketcpy(stats->attr, value, &stats->max, &stats->max_len); + bucketcpy(stats->attr, value, &stats->min, &stats->min_len); + } + stats->initialized = true; + } + if (VacAttrStatsLtGtValid(stats)) + { + if ((*fmgr_faddr(&stats->f_cmplt)) (value, stats->min)) + { + bucketcpy(stats->attr, value, &stats->min, &stats->min_len); + stats->min_cnt = 0; + } + if ((*fmgr_faddr(&stats->f_cmpgt)) (value, stats->max)) + { + bucketcpy(stats->attr, value, &stats->max, &stats->max_len); + stats->max_cnt = 0; + } + if ((*fmgr_faddr(&stats->f_cmpeq)) (value, stats->min)) + stats->min_cnt++; + else if ((*fmgr_faddr(&stats->f_cmpeq)) (value, stats->max)) + stats->max_cnt++; + } + if ((*fmgr_faddr(&stats->f_cmpeq)) (value, stats->best)) + stats->best_cnt++; + else if ((*fmgr_faddr(&stats->f_cmpeq)) (value, stats->guess1)) + { + stats->guess1_cnt++; + stats->guess1_hits++; + } + else if ((*fmgr_faddr(&stats->f_cmpeq)) (value, stats->guess2)) + stats->guess2_hits++; + else + value_hit = false; + + if (stats->guess2_hits > stats->guess1_hits) + { + swapDatum(stats->guess1, stats->guess2); + swapInt(stats->guess1_len, stats->guess2_len); + swapLong(stats->guess1_hits, stats->guess2_hits); + stats->guess1_cnt = stats->guess1_hits; + } + if (stats->guess1_cnt > stats->best_cnt) + { + swapDatum(stats->best, stats->guess1); + swapInt(stats->best_len, stats->guess1_len); + swapLong(stats->best_cnt, stats->guess1_cnt); + stats->guess1_hits = 1; + stats->guess2_hits = 1; + } + if (!value_hit) + { + bucketcpy(stats->attr, value, &stats->guess2, &stats->guess2_len); + stats->guess1_hits = 1; + stats->guess2_hits = 1; + } + } + } + return; +} + +/* + * bucketcpy() -- update pg_class statistics for one relation + * + */ +static void +bucketcpy(Form_pg_attribute attr, Datum value, Datum *bucket, int *bucket_len) +{ + if (attr->attbyval && attr->attlen != -1) + *bucket = value; + else + { + int len = (attr->attlen != -1 ? attr->attlen : VARSIZE(value)); + + if (len > *bucket_len) + { + if (*bucket_len != 0) + pfree(DatumGetPointer(*bucket)); + *bucket = PointerGetDatum(palloc(len)); + *bucket_len = len; + } + memmove(DatumGetPointer(*bucket), DatumGetPointer(value), len); + } +} + + +/* + * update_attstats() -- update attribute statistics for one relation + * + * Updates of pg_attribute statistics are handled by over-write. + * for reasons described above. + * + * To keep things simple, we punt for pg_statistic, and don't try + * to compute or store rows for pg_statistic itself in pg_statistic. + * This could possibly be made to work, but it's not worth the trouble. + */ +static void +update_attstats(Oid relid, int natts, VacAttrStats *vacattrstats) +{ + Relation ad, + sd; + HeapScanDesc scan; + HeapTuple atup, + stup; + ScanKeyData askey; + Form_pg_attribute attp; + + ad = heap_openr(AttributeRelationName, RowExclusiveLock); + sd = heap_openr(StatisticRelationName, RowExclusiveLock); + + /* Find pg_attribute rows for this relation */ + ScanKeyEntryInitialize(&askey, 0, Anum_pg_attribute_attrelid, + F_INT4EQ, relid); + + scan = heap_beginscan(ad, false, SnapshotNow, 1, &askey); + + while (HeapTupleIsValid(atup = heap_getnext(scan, 0))) + { + int i; + VacAttrStats *stats; + + attp = (Form_pg_attribute) GETSTRUCT(atup); + if (attp->attnum <= 0) /* skip system attributes for now */ + continue; + + for (i = 0; i < natts; i++) + { + if (attp->attnum == vacattrstats[i].attr->attnum) + break; + } + if (i >= natts) + continue; /* skip attr if no stats collected */ + stats = &(vacattrstats[i]); + + if (VacAttrStatsEqValid(stats)) + { + float32data selratio; /* average ratio of rows selected + * for a random constant */ + + /* Compute disbursion */ + if (stats->nonnull_cnt == 0 && stats->null_cnt == 0) + { + + /* + * empty relation, so put a dummy value in + * attdisbursion + */ + selratio = 0; + } + else if (stats->null_cnt <= 1 && stats->best_cnt == 1) + { + /* + * looks like we have a unique-key attribute --- flag + * this with special -1.0 flag value. + * + * The correct disbursion is 1.0/numberOfRows, but since + * the relation row count can get updated without + * recomputing disbursion, we want to store a + * "symbolic" value and figure 1.0/numberOfRows on the + * fly. + */ + selratio = -1; + } + else + { + if (VacAttrStatsLtGtValid(stats) && + stats->min_cnt + stats->max_cnt == stats->nonnull_cnt) + { + + /* + * exact result when there are just 1 or 2 + * values... + */ + double min_cnt_d = stats->min_cnt, + max_cnt_d = stats->max_cnt, + null_cnt_d = stats->null_cnt; + double total = ((double) stats->nonnull_cnt) + null_cnt_d; + + selratio = (min_cnt_d * min_cnt_d + max_cnt_d * max_cnt_d + null_cnt_d * null_cnt_d) / (total * total); + } + else + { + double most = (double) (stats->best_cnt > stats->null_cnt ? stats->best_cnt : stats->null_cnt); + double total = ((double) stats->nonnull_cnt) + ((double) stats->null_cnt); + + /* + * we assume count of other values are 20% of best + * count in table + */ + selratio = (most * most + 0.20 * most * (total - most)) / (total * total); + } + /* Make sure calculated values are in-range */ + if (selratio < 0.0) + selratio = 0.0; + else if (selratio > 1.0) + selratio = 1.0; + } + + /* overwrite the existing statistics in the tuple */ + attp->attdisbursion = selratio; + + /* invalidate the tuple in the cache and write the buffer */ + RelationInvalidateHeapTuple(ad, atup); + WriteNoReleaseBuffer(scan->rs_cbuf); + + /* + * Create pg_statistic tuples for the relation, if we have + * gathered the right data. del_stats() previously + * deleted all the pg_statistic tuples for the rel, so we + * just have to insert new ones here. + * + * Note vacuum_rel() has seen to it that we won't come here + * when vacuuming pg_statistic itself. + */ + if (VacAttrStatsLtGtValid(stats) && stats->initialized) + { + float32data nullratio; + float32data bestratio; + FmgrInfo out_function; + char *out_string; + double best_cnt_d = stats->best_cnt, + null_cnt_d = stats->null_cnt, + nonnull_cnt_d = stats->nonnull_cnt; /* prevent overflow */ + Datum values[Natts_pg_statistic]; + char nulls[Natts_pg_statistic]; + + nullratio = null_cnt_d / (nonnull_cnt_d + null_cnt_d); + bestratio = best_cnt_d / (nonnull_cnt_d + null_cnt_d); + + fmgr_info(stats->outfunc, &out_function); + + for (i = 0; i < Natts_pg_statistic; ++i) + nulls[i] = ' '; + + /* ---------------- + * initialize values[] + * ---------------- + */ + i = 0; + values[i++] = (Datum) relid; /* starelid */ + values[i++] = (Datum) attp->attnum; /* staattnum */ + values[i++] = (Datum) stats->op_cmplt; /* staop */ + /* hack: this code knows float4 is pass-by-ref */ + values[i++] = PointerGetDatum(&nullratio); /* stanullfrac */ + values[i++] = PointerGetDatum(&bestratio); /* stacommonfrac */ + out_string = (*fmgr_faddr(&out_function)) (stats->best, stats->typelem, stats->attr->atttypmod); + values[i++] = PointerGetDatum(textin(out_string)); /* stacommonval */ + pfree(out_string); + out_string = (*fmgr_faddr(&out_function)) (stats->min, stats->typelem, stats->attr->atttypmod); + values[i++] = PointerGetDatum(textin(out_string)); /* staloval */ + pfree(out_string); + out_string = (char *) (*fmgr_faddr(&out_function)) (stats->max, stats->typelem, stats->attr->atttypmod); + values[i++] = PointerGetDatum(textin(out_string)); /* stahival */ + pfree(out_string); + + stup = heap_formtuple(sd->rd_att, values, nulls); + + /* ---------------- + * Watch out for oversize tuple, which can happen if + * all three of the saved data values are long. + * Our fallback strategy is just to not store the + * pg_statistic tuple at all in that case. (We could + * replace the values by NULLs and still store the + * numeric stats, but presently selfuncs.c couldn't + * do anything useful with that case anyway.) + * + * We could reduce the probability of overflow, but not + * prevent it, by storing the data values as compressed + * text; is that worth doing? The problem should go + * away whenever long tuples get implemented... + * ---------------- + */ + if (MAXALIGN(stup->t_len) <= MaxTupleSize) + { + /* OK, store tuple and update indexes too */ + Relation irelations[Num_pg_statistic_indices]; + + heap_insert(sd, stup); + CatalogOpenIndices(Num_pg_statistic_indices, Name_pg_statistic_indices, irelations); + CatalogIndexInsert(irelations, Num_pg_statistic_indices, sd, stup); + CatalogCloseIndices(Num_pg_statistic_indices, irelations); + } + + /* release allocated space */ + pfree(DatumGetPointer(values[Anum_pg_statistic_stacommonval - 1])); + pfree(DatumGetPointer(values[Anum_pg_statistic_staloval - 1])); + pfree(DatumGetPointer(values[Anum_pg_statistic_stahival - 1])); + heap_freetuple(stup); + } + } + } + heap_endscan(scan); + /* close rels, but hold locks till upcoming commit */ + heap_close(ad, NoLock); + heap_close(sd, NoLock); +} + +/* + * del_stats() -- delete pg_statistic rows for a relation + * + * If a list of attribute numbers is given, only zap stats for those attrs. + */ +static void +del_stats(Oid relid, int attcnt, int *attnums) +{ + Relation pgstatistic; + HeapScanDesc scan; + HeapTuple tuple; + ScanKeyData key; + + pgstatistic = heap_openr(StatisticRelationName, RowExclusiveLock); + + ScanKeyEntryInitialize(&key, 0x0, Anum_pg_statistic_starelid, + F_OIDEQ, ObjectIdGetDatum(relid)); + scan = heap_beginscan(pgstatistic, false, SnapshotNow, 1, &key); + + while (HeapTupleIsValid(tuple = heap_getnext(scan, 0))) + { + if (attcnt > 0) + { + Form_pg_statistic pgs = (Form_pg_statistic) GETSTRUCT(tuple); + int i; + + for (i = 0; i < attcnt; i++) + { + if (pgs->staattnum == attnums[i] + 1) + break; + } + if (i >= attcnt) + continue; /* don't delete it */ + } + heap_delete(pgstatistic, &tuple->t_self, NULL); + } + + heap_endscan(scan); + + /* + * Close rel, but *keep* lock; we will need to reacquire it later, so + * there's a possibility of deadlock against another VACUUM process if + * we let go now. Keeping the lock shouldn't delay any common + * operation other than an attempted VACUUM of pg_statistic itself. + */ + heap_close(pgstatistic, NoLock); +} + + + -- 2.40.0