From: PatR Date: Fri, 26 Nov 2021 01:57:37 +0000 (-0800) Subject: get_rnd_text() selection distribution X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=59beebcbcc1f4a07ff6056fb0b2c5010fefad90e;p=nethack get_rnd_text() selection distribution While testing 'the("Capitalized Hallucinatory Monster")' I noticed that some hallucinatory monsters showed up more often than others. When the random engravings, epitaphs, and bogus monsters were converted from hard-coded arrays to data files accessed by random seek (3.6.0), they became subject to the same distribution irregularites that rumors suffer from. The chance that an entry will be chosen depends upon the chance that a random seek will hit somewhere in the line which precedes it, so entries that follow long lines are more likely to be chosen and entries that follow short lines are less likely. We improved that for rumors by having makedefs pad the shortest lines. Distribution still isn't uniform but is much better than it was (and could be further improved with a longer padding length at the cost of making data files bigger so possibly slower to access; both overall size and access speed mattered back when floppy discs were supported but are probably irrelevant now). Start doing the same thing for the newer files: pad the shortest lines to increase the chance that seek will find them. The tradeoff is that the data files become bigger. Rumors, engravings, and epitaphs lines are all at least 60 characters now; bogus monsters are at least 20. These are the data file sizes I see (in bytes: old, new; padding for rumors was already in use so its size hasn't changed): bogusmon 4449 7211 engrave 1326 2894 epitaph 14159 24075 rumors 49173 49173 The only place that padding is noticeable in-game is #wizrumorcheck. --- diff --git a/doc/fixes37.0 b/doc/fixes37.0 index 5adf3188e..1c6d86c9f 100644 --- a/doc/fixes37.0 +++ b/doc/fixes37.0 @@ -695,6 +695,12 @@ assigning a fruit name that matches the name of an artifact which doesn't use any "the" prefix could yield messages showing "the Artifact" when dealing with the artifact rather than fruit: "You are blasted by _the_ Excalibur's power!"; didn't impact basic inventory formatting +selection of random engravings, epitaphs, and hallucinatory monster names had + the same problem that rumor selection used to have: entries which + follow longer than average lines are most likely to be chosen and + ones which follow shorter than average lines are least likely; use + same workaround as for rumors: pad the shortest lines; result isn't + uniforn distribution but is better (tradeoff vs size; see makedefs) Fixes to 3.7.0-x Problems that Were Exposed Via git Repository diff --git a/src/rumors.c b/src/rumors.c index 39985734b..b70ced340 100644 --- a/src/rumors.c +++ b/src/rumors.c @@ -40,6 +40,7 @@ * and placed there by 'makedefs'. */ +static void unpadline(char *); static void init_rumors(dlb *); static void init_oracles(dlb *); static void others_check(const char *ftype, const char *, winid *); @@ -51,6 +52,24 @@ static void init_CapMons(void); static unsigned CapMonSiz = 0; static const char **CapMons = 0; +/* makedefs pads short rumors, epitaphs, engravings, and hallucinatory + monster names with trailing underscores; strip those off */ +static void +unpadline(char *line) +{ + char *p = eos(line); + + /* remove newline if still present; caller should have stripped it */ + if (p > line && p[-1] == '\n') + --p; + + /* remove padding */ + while (p > line && p[-1] == '_') + --p; + + *p = '\0'; +} + DISABLE_WARNING_FORMAT_NONLITERAL static void @@ -158,21 +177,7 @@ getrumor( g.true_rumor_size = -1; /* don't try to open it again */ } - /* this is safe either way, so do it always since we can't get the - * definition out of makedefs.c - */ -#define PAD_RUMORS_TO -#ifdef PAD_RUMORS_TO - /* remove padding */ - { - char *x = eos(rumor_buf) - 1; - - while (x > rumor_buf && *x == '_') - x--; - *++x = '\n'; - *x = '\0'; - } -#endif + unpadline(rumor_buf); return rumor_buf; } @@ -426,8 +431,9 @@ get_rnd_text(const char* fname, char* buf, int (*rng)(int)) (void) dlb_fgets(line, sizeof line, fh); } if ((endp = index(line, '\n')) != 0) - *endp = 0; + *endp = '\0'; Strcat(buf, xcrypt(line, xbuf)); + unpadline(buf); /* strip padding that makedefs added to end of line */ (void) dlb_fclose(fh); } else { couldnt_open_file(fname); diff --git a/util/makedefs.c b/util/makedefs.c index 6af03358e..800d2f935 100644 --- a/util/makedefs.c +++ b/util/makedefs.c @@ -104,14 +104,29 @@ static const char SCCS_Id[] UNUSED = "@(#)makedefs.c\t3.7\t2020/01/18"; #endif /* else !MAC */ #endif /* else !AMIGA */ +/* + * For files where entries are selected by seeking to a random position, + * skipping to newline, and then using the next line, lines that follow + * long ones are more likely to be selected than average and lines that + * follow short ones are less likely to be selected than average. + * We make selection be more evenly distributed by padding the shortest + * lines, at the cost of making the data files bigger. The larger these + * values are, the more uniform the selection will become but the more + * space will be needed for data used for something which is relatively + * inconsequential to actual game play. A value of 0 would suppress the + * padding because every line is already at least that long. + */ +#define PAD_RUMORS_TO 60u /* also used for epitaphs and engravings */ +#define PAD_BOGONS_TO 20u /* hallucinatory monsters */ + static const char *Dont_Edit_Code = "/* This source file is generated by 'makedefs'. Do not edit. */\n", *Dont_Edit_Data = "#\tThis data file is generated by 'makedefs'. Do not edit. \n", *Reference_file = - "/* This file is generated by 'makedefs' for reference purposes only. */\n" - "/* It is no longer used in the NetHack build. */\n"; + "/* This file is generated by 'makedefs' for reference purposes only. */\n" + "/* It is no longer used in the NetHack build. */\n"; static struct version_info version; @@ -151,9 +166,10 @@ static char *name_file(const char *, const char *); static FILE *getfp(const char *, const char *, const char *, int); static void do_ext_makedefs(int, char **); static char *xcrypt(const char *); +static char *padline(char *, unsigned); static unsigned long read_rumors_file(const char *, int *, - long *, unsigned long); -static void do_rnd_access_file(const char *, const char *); + long *, unsigned long, unsigned); +static void do_rnd_access_file(const char *, const char *, unsigned); static boolean d_filter(char *); static boolean h_filter(char *); static void opt_out_words(char *, int *); @@ -312,15 +328,17 @@ do_makedefs(char *options) */ do_rnd_access_file(EPITAPHFILE, /* default epitaph: parody of the default engraving */ - "No matter where I went, here I am."); + "No matter where I went, here I am.", + PAD_RUMORS_TO); do_rnd_access_file(ENGRAVEFILE, /* default engraving: popularized by "The Adventures of Buckaroo Bonzai Across the 8th Dimenstion" but predates that 1984 movie; some attribute it to Confucius */ - "No matter where you go, there you are."); + "No matter where you go, there you are.", + PAD_RUMORS_TO); do_rnd_access_file(BOGUSMONFILE, /* default bogusmon: iconic monster that isn't in nethack */ - "grue"); + "grue", PAD_BOGONS_TO); break; case 'h': case 'H': @@ -867,11 +885,45 @@ xcrypt(const char* str) return buf; } -#define PAD_RUMORS_TO 60 +static char * +padline(char *line, unsigned padlength) +{ + /* + * Rumor selection is accomplished by seeking to a random + * position in the file, advancing to newline, and taking + * the next line. Therefore, rumors which follow long-line + * rumors are most likely to be chosen and rumors which + * follow short-line rumors are least likely to be chosen. + * We ameliorate the latter by padding the shortest lines, + * increasing the chance of the random seek landing in them. + * + * Random epitaphs, engravings, and hallucinatory monster + * names are in the same boat. + */ + char *endp; + unsigned len = (unsigned) strlen(line); /* includes newline */ + + if (len <= padlength) { + endp = index(line, '\n'); + + /* this is only safe because fgetline() overallocates */ + while (len++ < padlength) { + *endp++ = '_'; + } + *endp++ = '\n'; + *endp = '\0'; + } + return line; +} + /* common code for do_rumors(). Return 0 on error. */ static unsigned long -read_rumors_file(const char* file_ext, int* rumor_count, - long* rumor_size, unsigned long old_rumor_offset) +read_rumors_file( + const char *file_ext, + int *rumor_count, + long *rumor_size, + unsigned long old_rumor_offset, + unsigned padlength) { char infile[MAXFNAMELEN]; char *line; @@ -886,26 +938,8 @@ read_rumors_file(const char* file_ext, int* rumor_count, /* copy the rumors */ while ((line = fgetline(ifp)) != 0) { -#ifdef PAD_RUMORS_TO - /* rumor selection is accomplished by seeking to a random - position in the file, advancing to newline, and taking - the next line; therefore, rumors which follow long-line - rumors are most likely to be chosen and rumors which - follow short-line rumors are least likely to be chosen; - we ameliorate the latter by padding the shortest lines, - increasing the chance of the random seek landing in them */ - int len = (int) strlen(line); - - if (len <= PAD_RUMORS_TO) { - char *base = index(line, '\n'); - /* this is only safe because fgetline() overallocates */ - while (len++ < PAD_RUMORS_TO) { - *base++ = '_'; - } - *base++ = '\n'; - *base = '\0'; - } -#endif + (void) padline(line, padlength); /* make shortest lines be longer */ + (*rumor_count)++; #if 0 /*[if we forced binary output, this would be sufficient]*/ @@ -928,7 +962,10 @@ read_rumors_file(const char* file_ext, int* rumor_count, } static void -do_rnd_access_file(const char* fname, const char* deflt_content) +do_rnd_access_file( + const char *fname, + const char *deflt_content, + unsigned padlength) { char *line, buf[BUFSZ]; @@ -948,15 +985,14 @@ do_rnd_access_file(const char* fname, const char* deflt_content) exit(EXIT_FAILURE); } Fprintf(ofp, "%s", Dont_Edit_Data); - /* lines from the file include trailing newline so make sure that the - default one does too */ - if (!index(deflt_content, '\n')) - deflt_content = strcat(strcpy(buf, deflt_content), "\n"); /* write out the default content entry unconditionally instead of waiting to see whether there are no regular output lines; if it matches a regular entry (bogusmon "grue"), that entry will become more likely to be picked than normal but it's nothing to worry about */ - (void) fputs(xcrypt(deflt_content), ofp); + Strcpy(buf, deflt_content); + if (!index(buf, '\n')) /* lines from the file include trailing newline +*/ + Strcat(buf, "\n"); /* so make sure that the default one does too */ + (void) fputs(xcrypt(padline(buf, padlength)), ofp); tfp = getfp(DATA_TEMPLATE, "grep.tmp", WRTMODE, FLG_TEMPFILE); grep0(ifp, tfp, FLG_TEMPFILE); @@ -966,8 +1002,10 @@ do_rnd_access_file(const char* fname, const char* deflt_content) ifp = getfp(DATA_TEMPLATE, "grep.tmp", RDTMODE, 0); #endif while ((line = fgetline(ifp)) != 0) { - if (line[0] != '#' && line[0] != '\n') + if (line[0] != '#' && line[0] != '\n') { + (void) padline(line, padlength); (void) fputs(xcrypt(line), ofp); + } free((genericptr_t) line); } Fclose(ifp); @@ -1019,13 +1057,15 @@ do_rumors(void) /* record the current position; true rumors will start here */ true_rumor_offset = ftell(tfp); - false_rumor_offset = read_rumors_file( - ".tru", &true_rumor_count, &true_rumor_size, true_rumor_offset); + false_rumor_offset = read_rumors_file(".tru", &true_rumor_count, + &true_rumor_size, true_rumor_offset, + PAD_RUMORS_TO); if (!false_rumor_offset) goto rumors_failure; eof_offset = read_rumors_file(".fal", &false_rumor_count, - &false_rumor_size, false_rumor_offset); + &false_rumor_size, false_rumor_offset, + PAD_RUMORS_TO); if (!eof_offset) goto rumors_failure;