cde/doc/C/guides/man/man4/dtsrhanf.sgm

   1 <!-- $XConsortium: dtsrhanf.sgm /main/6 1996/09/08 20:19:48 rws $ -->
   2 <!-- (c) Copyright 1996 Digital Equipment Corporation. -->
   3 <!-- (c) Copyright 1996 Hewlett-Packard Company. -->
   4 <!-- (c) Copyright 1996 International Business Machines Corp. -->
   5 <!-- (c) Copyright 1996 Sun Microsystems, Inc. -->
   6 <!-- (c) Copyright 1996 Novell, Inc. -->
   7 <!-- (c) Copyright 1996 FUJITSU LIMITED. -->
   8 <!-- (c) Copyright 1996 Hitachi. -->
   9 <![ %CDE.C.CDE; [<RefEntry Id="CDE.INFO.dtsrhanfile">]]>
  10 <RefMeta>
  11 <RefEntryTitle>dtsrhanfile</RefEntryTitle>
  12 <ManVolNum>special file</ManVolNum>
  13 </RefMeta>
  14 <RefNameDiv>
  15 <RefName>dtsrhanfile</RefName>
  16 <RefPurpose>
  17 Describes the format and syntax of DtSearch han files
  18 </RefPurpose>
  19 </RefNameDiv>
  20 <RefSynopsisDiv>
  21 <Synopsis>
  22 <Symbol Role="Variable">filename</Symbol>.han
  23 </Synopsis>
  24 </RefSynopsisDiv>
  25 <RefSect1>
  26 <Title>DESCRIPTION</Title>
  27 <Para>Han files are the user generated profile files for <command>dtsrhan</command>.
  28 They identify fields in incoming text from which output fzk
  29 file fields can be constructed. The data from han files
  30 are loaded into memory by <command>dtsrhan</command> at initialization time.
  31 <command>dtsrhan</command> and han files have not been internationalized;
  32 han files may only contain ASCII characters.
  33 </para>
  34 <refsect2>
  35 <Title>General Format</Title>
  36 <para>All identifiers must begin with a letter, and must be composed entirely
  37 of alphanumerics and/or the underscore.
  38 </para>
  39 <para>Observe the following points when using "strings":
  40 </para>
  41 <itemizedlist>
  42 <listitem>
  43 <para>If an identifying string contains quotes, use a backslash
  44 to create the quote. Example:
  45 </para>
  46 <programlisting>
  47 this string \"contains\" quotes
  48 </programlisting>
  49 <para>would find the string <literal>this string "contains" quotes</literal>.
  50 </para>
  51 </listitem>
  52 <listitem>
  53 <para>The above point makes it necessary to use double backslashes to create
  54 a single backslash. Example:
  55 </para>
  56 <programlisting>
  57 this string has a \\ backslash
  58 </programlisting>
  59 <para>would find the string <literal>this string has a \ backslash</literal>.
  60 </para>
  61 </listitem>
  62 <listitem>
  63 <para>Actually, using the backslash in any string will cause the next
  64 character to be included without exception. Thus, a string
  65 with <literal>this is \a test</literal> will end up being
  66 <literal>this is a test</literal>.
  67 The backslash is ignored, and the next character is imbedded
  68 in the string. This is only needed in the two cases described
  69 above, but can be used for any purpose.
  70 </para>
  71 </listitem>
  72 </itemizedlist>
  73 </refsect2>
  74 <refsect2>
  75 <Title>Individual Line Syntax</Title>
  76 <variablelist>
  77 <varlistentry><term># ... | blank line</term>
  78 <listitem>
  79 <para>Han file comment. Any line beginning with a pound sign
  80 in the first column, or any blank line, is discarded.
  81 </para>
  82 </listitem>
  83 </varlistentry>
  84 <varlistentry><term>line <emphasis>identifier</emphasis> = <emphasis>physical_line_number</emphasis></term>
  85 <listitem>
  86 <para>Defines a <literal>line</literal> with a physical line number in the record.
  87 <emphasis>physical_line_number</emphasis> must be a number.
  88 </para>
  89 </listitem>
  90 </varlistentry>
  91 <varlistentry><term>line <emphasis>identifier</emphasis> = column_number,"<emphasis>string</emphasis>" [<emphasis>column_number</emphasis>,"<emphasis>string</emphasis>"] ...</term>
  92 <listitem>
  93 <para>Defines a <literal>line</literal> using a column number and a
  94 'signature' string that should appear at that column.
  95 <emphasis>column_number</emphasis> can be a number, or
  96 <literal>*</literal> for 'any column'. "<emphasis>string</emphasis>"
  97 should be a string that occurs on the line in question. It is possible
  98 to define complex signatures using multiple clauses.
  99 </para>
 100 </listitem>
 101 </varlistentry>
 102 <varlistentry><term>field <emphasis>identifier</emphasis> = <emphasis>line_identifier</emphasis>,"<emphasis>string</emphasis>", <emphasis>offset</emphasis>, <emphasis>length</emphasis></term>
 103 <listitem>
 104 <para>Defines a <literal>field</literal> based on a declared line, a string
 105 found on that line, the offset from the first letter of the string, and
 106 the length of field.
 107 </para>
 108 <para><emphasis>line_identifier</emphasis> is an identifier declared with the
 109 <literal>line</literal> directive (see above).
 110 </para>
 111 <para>"<emphasis>string</emphasis>" is a string for relative positioning,
 112 where a field will follow a string that may not always occur in the
 113 same position on a line. If it is known that the field will always be
 114 in the same position, an empty string ("") may be used.
 115 <emphasis>string</emphasis> must be enclosed in double quotes.
 116 <emphasis>offset</emphasis> must be a number, identifying the offset
 117 from the first character in the string. It starts at position 1, not 0,
 118 and may be negative.
 119 </para>
 120 <para><emphasis>length</emphasis> represents the length of the field. It may
 121 be a number, or it may be one of two special tokens:
 122 </para>
 123 <variablelist>
 124 <varlistentry><term><literal>eow</literal></term>
 125 <listitem>
 126 <para>End of word. The field will begin at <emphasis>offset</emphasis> and
 127 continue until the next white-space character.
 128 </para>
 129 </listitem>
 130 </varlistentry>
 131 <varlistentry><term><literal>eoln</literal></term>
 132 <listitem>
 133 <para>End of line. The field will begin at <emphasis>offset</emphasis> and
 134 continue to the end of the line.
 135 </para>
 136 </listitem>
 137 </varlistentry>
 138 </variablelist>
 139 <para>An identifier <emphasis>string</emphasis> beginning with 3 uppercase M's
 140 ("MMM...") will be considered an English month name string.
 141 At run time, if the first 3 chars of the field's value
 142 equal the first three chars of an English month name,
 143 the value string will be translated to a two character
 144 string of digits in the range "01" to "12".
 145 For example, if field <emphasis>MMMmymonth</emphasis> had an original value of
 146 "April ", it will be translated to "04" before use.
 147 </para>
 148 <para>In the case where a <literal>line</literal> identifier is associated with
 149 multiple lines in a single document, the field value will
 150 be determined from the last occurrence of the line within
 151 the record.
 152 </para>
 153 </listitem>
 154 </varlistentry>
 155 <varlistentry><term>constant <emphasis>identifier</emphasis> = "<emphasis>string</emphasis>"</term>
 156 <listitem>
 157 <para>Defines a <literal>constant</literal> field that can be used in
 158 abstracts and keys. The <emphasis>identifier</emphasis> is defined
 159 exactly the same as a <literal>field</literal> identifier. The value
 160 must be enclosed in double quotes.
 161 </para>
 162 </listitem>
 163 </varlistentry>
 164 <varlistentry><term>date = null | <emphasis>field_id</emphasis> [+ <emphasis>field_id</emphasis>] ...</term>
 165 <listitem>
 166 <para>Defines the document date for each document. It will
 167 be converted into a correctly formatted fzk file date line.
 168 </para>
 169 <para><literal>null</literal> specifies undated documents. Undated documents
 170 always qualify for searches irrespective of date
 171 qualifiers in <function>DtSearchQuery</function>.
 172 </para>
 173 <para><emphasis>field_id</emphasis> is an identifier declared using the <literal>field</literal>
 174 or <literal>constant</literal> directives (see above).
 175 "MMM" fields are often useful for date assemblies.
 176 </para>
 177 <para>Multiple fields may be concatenated into a date.
 178 </para>
 179 <para>After concatenation, the assembled date must be of the following format:
 180 <emphasis>YYYYMMDDhhmm</emphasis> (exactly 12 digits). For example,
 181 <literal>199404171701</literal> is April 17, 1994 at 5:01 pm.
 182 <literal>200405031000</literal> is May 3, 2004, at 10:00 am (10
 183 o'clock).
 184 </para>
 185 <para>Dates before 1900 or after 5995 are invalid.
 186 </para>
 187 <para>If <literal>date</literal> is not specified or is invalid, a generated date
 188 based on the current date and time will be used, but an
 189 invalid <literal>date</literal> will also generate an error message.
 190 </para>
 191 </listitem>
 192 </varlistentry>
 193 <varlistentry><term>key = <emphasis>field_id</emphasis> [+ <emphasis>field_id</emphasis>] ... | time | count</term>
 194 <listitem>
 195 <para>Defines the unique database key for each record in a fzk file.
 196 </para>
 197 <para><emphasis>field_id</emphasis> is a field identifier declared using the
 198 <literal>field</literal> or <literal>constant</literal> directives.
 199 </para>
 200 <para>Multiple fields may be concatenated into a key.
 201 </para>
 202 <para><literal>time</literal> is a special keyword used to generate keys based
 203 on the current run date and time, plus a sequential count suffix.
 204 </para>
 205 <para><literal>count</literal> is a special keyword used to generate keys
 206 based on a sequential count of records.
 207 </para>
 208 </listitem>
 209 </varlistentry>
 210 <varlistentry><term>upper</term>
 211 <listitem>
 212 <para>Specifies that keys written by <command>dtsrhan</command> are to be entirely converted
 213 to upper case. Without using this directive, mixed-case keys
 214 are allowed.
 215 </para>
 216 </listitem>
 217 </varlistentry>
 218 <varlistentry><term>keychar = A | B | ...Z</term>
 219 <listitem>
 220 <para>Defines the character used to categorize keys for DtSearch. It
 221 must be an uppercase ASCII alphabetic character.
 222 </para>
 223 </listitem>
 224 </varlistentry>
 225 <varlistentry><term>delimiter = <emphasis>line_identifier</emphasis>, bottom</term>
 226 <listitem>
 227 <para>Defines the end of text (ETX) delimiter that will separate records.
 228 </para>
 229 <para><emphasis>line_identifier</emphasis> is an identifier declared with the
 230 <literal>line</literal> directive.
 231 </para>
 232 <para><literal>bottom</literal> is required. It specifies that the ETX will
 233 occur at the bottom of each record. Top of record delimiters are not
 234 supported.
 235 </para>
 236 </listitem>
 237 </varlistentry>
 238 <varlistentry><term>image = all | none</term>
 239 <listitem>
 240 <para>Defines whether the document image retrieved by
 241 <function>DtSearchRetrieve</function> is to contain all or none of the
 242 record, prior to application of <literal>imageinclude</literal> or
 243 <literal>imageexclude</literal> directives later in the han file. It
 244 defaults to <literal>all</literal>.
 245 </para>
 246 </listitem>
 247 </varlistentry>
 248 <varlistentry><term>imageinclude = <emphasis>line_identifier</emphasis> [- <emphasis>line_identifier</emphasis>]</term>
 249 <listitem>
 250 <para>Defines a line (or range of lines) to be included in the image.
 251 <emphasis>line_identifier</emphasis> is an identifier declared with the
 252 <literal>line</literal> directive.
 253 </para>
 254 </listitem>
 255 </varlistentry>
 256 <varlistentry><term>imageexclude = <emphasis>line_identifier</emphasis> [- <emphasis>line_identifier</emphasis>]</term>
 257 <listitem>
 258 <para>Defines a line (or range of lines) to be excluded from the image.
 259 <emphasis>line_identifier</emphasis> is an identifier declared with the
 260 <literal>line</literal> directive.
 261 </para>
 262 </listitem>
 263 </varlistentry>
 264 <varlistentry><term>abstract = field(s) <emphasis>field_identifier</emphasis> [+ <emphasis>field_identifier</emphasis>]...</term>
 265 <listitem>
 266 <para>Defines the abstract to be placed into the fzk file. It is created from
 267 the concatenation of fields. <emphasis>field_identifier</emphasis> is
 268 an identifier declared with the <literal>field</literal> or <literal>constant</literal> directives.
 269 </para>
 270 </listitem>
 271 </varlistentry>
 272 <varlistentry><term>delblanklines = true | false</term>
 273 <listitem>
 274 <para>Determines if blank lines are to be removed from the record image or
 275 not. It defaults to <literal>false</literal>.
 276 </para>
 277 </listitem>
 278 </varlistentry>
 279 </variablelist>
 280 </refsect2>
 281 <refsect2>
 282 <Title>Example</Title>
 283 <para>The sample han file shown here describes a text file containing a
 284 concatenated set of man page documents.
 285 </para>
 286 <programlisting>
 287 # All records in the incoming text file are delimited by the same
 288 # end of text convention as the default for an fzk file, namely
 289 # a formfeed (control-L) on a line by itself ("\f\n").
 290 # Define a line named "etx" with that description,
 291 # and declare it to be the &lt;delimiter>.
 292 # Note that there must be a real ASCII control-L character between
 293 # the quotes in the line below.
 294 line etx = *,"^L"
 295 delimiter = etx, bottom
 296
 297 # The command name that the man page is describing is on the first line.
 298 # To access it we need to define a line directive for line number 1.
 299 line line1 = 1
 300
 301 # The name of the man page command begins in column 3 of line 1,
 302 # and the length is variable.  So we define a field identifier
 303 # named "command1" from column 3 to the end of the word.
 304 field command1 = line1,"",3,eow
 305
 306 # We want each document abstract to have a constant prefix
 307 # followed by the name of the command.
 308 constant preabs = "Man Pages for "
 309 abstract = fields preabs + command1
 310
 311 # We want all keys to be the name of the command, prefixed with
 312 # the same identifying character, an uppercase M.
 313 keychar = M
 314 key = command1
 315
 316 # We want each document date to be equivalent to the release
 317 # date of the original man pages, which we choose here to hard code
 318 # as November 1, 1994, at 1 o'clock in the afternoon.
 319 constant datecons = "199411011300"
 320 date = datecons
 321 </programlisting>
 322 </refsect2>
 323 </refsect1>
 324 <RefSect1>
 325 <Title>SEE ALSO</Title>
 326 <Para>&cdeman.dtsrhan;,
 327 &cdeman.dtsrindex;,
 328 &cdeman.dtsrfzkfiles;,
 329 &cdeman.dtsrlangfiles;,
 330 &cdeman.DtSearch;
 331 </Para>
 332 </RefSect1>
 333 </RefEntry>