hx.c

(plain)

    1 /* HX
    2 ** A simplified, dependency-less, and hopefully portable utility for converting
    3 ** data into a hexadecimal representation.                                (C99)
    4 ** ----------------------------------------------------------------------------
    5 ** Copyright (c) 2022, 2023 Amelia Zabardast Ziabari
    6 **
    7 ** Redistribution and use in source and binary forms, with or without
    8 ** modification, are permitted provided that the following conditions are met:
    9 **
   10 **  1. Redistributions of source code must retain the above copyright notice,
   11 **     this list of conditions and the following disclaimer.
   12 **  2. Redistributions in binary form must reproduce the above copyright
   13 **     notice, this list of conditions and the following disclaimer in the
   14 **     documentation and/or other materials provided with the distribution.
   15 **
   16 ** THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
   17 ** AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   18 ** IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   19 ** ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
   20 ** LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
   21 ** CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
   22 ** SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
   23 ** INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
   24 ** CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
   25 ** ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
   26 ** POSSIBILITY OF SUCH DAMAGE.
   27 ** ----------------------------------------------------------------------------
   28 ** What's New?
   29 **
   30 ** (2.0.2) SHX -> HX
   31 **
   32 ** (2.0.1) Re-wrote the argument parser to be more lenient, and corrected a bug
   33 **         that made the last line of the pretty print output sometimes have
   34 **         the wrong address.
   35 **
   36 ** (2.0.0) Refactored large parts of the program.
   37 **
   38 ** (1.1.3) Moved the ASCII and hex character strings into the same variable in
   39 **         order to waste less space.
   40 **
   41 ** (1.1.2) Made the input size counter display clearer in the pretty printed
   42 **         output by prefixing it, aligning it and giving it a unit.
   43 **
   44 ** (1.1.0) Fixed a few C standard compatibility issues that were making the
   45 **         program fail to compile using tools such as lcc-win32.  Mainly
   46 **         stopped using the __VA_ARGS__ version of the RUN_ONCE macro, as it
   47 **         causes issues with some compilers.
   48 */
   49 #include <stdio.h>
   50 #include <string.h>
   51 #include <stdlib.h>
   52 #include <errno.h>
   53 
   54 
   55 /* --- CONFIGURATION ------------------------------------------------------- */
   56 
   57 
   /* Program version string; it is pasted into the help text. */
   #define VERSION "2.0.2"

   /* Bit mask applied to the byte counter to find the position of an octet
   ** within a row of the pretty-printed output.  It has to be a power of two
   ** minus one.  The default, (1 << 4) - 1 == 15, gives rows of exactly 16 8-bit
   ** hexadecimal values, which is common in other utilities with similar
   ** functionality.
   */
   #define LINE_MAX ((1 << 4) - 1)

   /* The type of the byte counter, and the printf conversion that matches it.
   ** The type needs to be relatively large in order to cope with huge inputs.
   ** A 64-bit maximum is probably fine, given that only 48-bit addresses are
   ** ever printed.
   */
   #define CT_TYPE size_t
   #define CT_PRINT "zu"
   73 
   74 
   75 /* --- HELPER MACROS ------------------------------------------------------- */
   76 
   77 
   /* More than one predefined macro can indicate a Windows build, so test for
   ** all of the known ones and fold them into a single IS_WINDOWS flag.
   */
   #if defined(__WIN32__) || defined(WIN32) || defined(_WIN32)
   # define IS_WINDOWS
   #endif

   /* SET_BINARY_MODE(handle) switches a stdio stream to binary mode.  Platforms
   ** like Windows would otherwise alter the input without asking; on every
   ** other platform the macro does nothing.
   */
   #ifndef IS_WINDOWS
   # define SET_BINARY_MODE(handle) ((void) 0)
   #else
   # include <io.h>
   # include <fcntl.h>
   # define SET_BINARY_MODE(handle) _setmode(_fileno(handle), _O_BINARY)
   #endif
   93 
   /* __poly_inline is the inlining hint placed on the small helper functions.
   ** Only compilers that define __GNUC__ get a hint at all: plain "inline"
   ** before version 5, and "inline" plus the "hot" attribute from version 5
   ** onwards.  For every other compiler the macro expands to nothing.
   */
   #if !defined(__GNUC__)
   # define __poly_inline
   #elif __GNUC__ >= 5
   # define __poly_inline inline __attribute__((hot))
   #else
   # define __poly_inline inline
   #endif

   /* Prints a printf-style message on stdout and then makes the enclosing
   ** function return 1.
   */
   #define ARG_ERR(...) \
           do \
                   { \
                   printf(__VA_ARGS__); \
                   return 1; \
                   } while (0)

   /* A "do once" construct: the block written between __once_l and __once_r is
   ** executed only the first time control reaches it.  Each use site gets its
   ** own static flag.
   */
   #define __once_l do { static int once_done; \
           if (!once_done) { once_done = 1; do
   #define __once_r while (0); } } while (0)
  112 
  /* The setmode() trick doesn't work in some Windows compiler environments,
  ** namely Digital Mars, so on Windows getchar() is re-implemented on top of
  ** the Win32 API.  Every other platform simply uses getchar().
  **
  ** Returns the next octet of standard input as an unsigned char converted to
  ** int.  Returns EOF at the end of the input, when the read fails, or when
  ** the process has no usable standard input handle.
  */
  #ifdef IS_WINDOWS
  # include <windows.h>
  static int __poly_getchar(void)
          {
          static HANDLE wstdin;
          unsigned char octet = 0;
          DWORD got = 0;

          /* The handle never changes, so it is looked up only once. */
          __once_l
                  {
                  wstdin = GetStdHandle(STD_INPUT_HANDLE);
                  } __once_r;

          if (wstdin == NULL || wstdin == INVALID_HANDLE_VALUE) return EOF;

          /* A failed read (a broken pipe, for instance) is reported the same
          ** way as the end of the input instead of being silently ignored.
          */
          if (!ReadFile(wstdin, &octet, 1, &got, NULL) || got == 0) return EOF;
          return octet;
          }
  #else
  # define __poly_getchar() getchar()
  #endif
  136 
  137 
  138 /* --- MAIN PROGRAM -------------------------------------------------------- */
  139 
  140 
  141 static int pretty_buf[LINE_MAX + 1];
  142 static int     ctr = 0;
  143 static CT_TYPE ct  = 0;
  144 
  145 static int pretty = 0, count = 0, help = 0, rlimit = 0,
  146            ascii  = 0, limit = 0, cols = 0, skip   = 0;
  147 
  148 static const char *readme =
  149         "hx (" VERSION ")"                                                 "\n"
  150         "Copyright (c) 2022, 2023 Amelia Zabardast Ziabari"              "\n\n"
  151         "%s [OPTION...] [FILENAME]"                                      "\n\n"
  152         "Where OPTION is one or more of the following:"                    "\n"
  153         "  -p        use formal pretty-printing instead of raw output"     "\n"
  154         "  -c        print the total length in octets"                     "\n"
  155         "  -h        print this help information"                          "\n"
  156         "  -sXXXXX   limit the number of octets to read (after skipping)"  "\n"
  157         "  -jXXXXX   skip a certain number of octets"                      "\n"
  158         "            (faster when using FILENAME instead of stdin)"        "\n"
  159         "  -a        when in raw output, print only filtered ASCII"        "\n"
  160         "  -l[XXX]   when in raw output, limit the output columns"         "\n"
  161         "            (the default value is 80)"                          "\n\n"
  162         "The FILENAME is optional, if it is not provided, stdin is used." "\n";
  163 
  /* One table serves two purposes in order to keep the binary small.
  **
  **  - Its first 16 characters are the hexadecimal digits in order, so
  **    indexing it with a 4-bit value yields that value's hex digit.
  **  - As a whole it is the set of characters that may be shown verbatim in
  **    the ASCII output; for that use the order of the characters is
  **    irrelevant.
  **
  ** A full lookup table with an entry for every 8-bit value might be simpler
  ** to use, but it would make the binary larger without a meaningful gain in
  ** performance.
  */
  static const char *ascii_chars =
          "0123456789ABCDEF"              /* hex digits: index == value */
          "GHIJKLMNOPQRSTUVWXYZ"
          "abcdefghijklmnopqrstuvwxyz"
          "!\"#$%&'()*+,-./"
          ":;<=>?@"
          "[\\]^_`"
          "{|}~"
          " ";
  181 
  182 
  183 static __poly_inline void ascii_print(int c)
  184         {
  185         if (strchr(ascii_chars, c) != NULL && c != 0)
  186                 putchar(c);
  187         else
  188                 putchar('.');
  189         }
  190 
  191 
  192 /* This is a crude way to check how many bytes are required to store an
  193 ** unsigned integer, to check for overflow.
  194 */
  195 static __poly_inline int bytes_req(CT_TYPE val)
  196         {
  197         int bytes = 0;
  198         while (val != 0)
  199                 {
  200                 val >>= 8;
  201                 bytes++;
  202                 }
  203         return bytes;
  204         }
  205 
  206 
  207 /* There appear to be some weird bugs in the standard C library on some
  208 ** platforms that break the handling of long long types in the printf family
  209 ** of functions.  I'm not sure why, but to remedy it I've written my own code
  210 ** to handle printing hexadecimal numbers in this function.  It also handles
  211 ** overflow, so if a number is too big to be represented, it will just blank it
  212 ** out with asterisks instead of messing up the alignment of the output.
  213 */
  214 static void hex_print(CT_TYPE val, int pad, const char *add)
  215         {
  216 #define ADD_BUFFER 8
  217 
  218         /* Create a buffer capable of holding the maximum printable size. */
  219         static char fmt[sizeof(CT_TYPE) * 2 + 1 + ADD_BUFFER] = {0};
  220 
  221         /* Calculates the overflow based on the available padding. */
  222         int i, overflow = bytes_req(val) > pad;
  223 
  224         /* Pad the empty space with zeros.  If val too large to print, fill the
  225         ** buffer with a warning character and then skip to printing.
  226         */
  227         memset(fmt, overflow ? '*' : '0', pad <<= 1);
  228         fmt[pad--] = 0;
  229 
  230         if (overflow) goto overflow_skip;
  231 
  232         /* Iterate over the padded space and map each index to a byte out of
  233         ** the input value, and then convert that byte to a hexadecimal string
  234         ** which can be pasted into the padding space.
  235         **/
  236         for (i = 0; i <= pad; i += 2)
  237                 {
  238                 unsigned char byte = ((unsigned char *) &val)[i >> 1];
  239                 /* "pad - i" makes sure it starts writing from the end of the
  240                 ** value instead of the start, ensuring a correct order for
  241                 ** inputs larger than 8 bits.
  242                 */
  243                 fmt[pad - i - 1] = ascii_chars[byte >> 4];
  244                 fmt[pad - i    ] = ascii_chars[byte & 15];
  245                 }
  246 
  247 overflow_skip:
  248         /* XXX: It may be faster to do things this way than to concatenate the
  249         ** strings with printf.  Unsure?
  250         */
  251         if (strlen(add) < ADD_BUFFER) strcat(fmt, add);
  252         fputs((const char *) fmt, stdout);
  253         }
  254 
  255 
  256 static void dump_pretty_buf(int exit)
  257         {
  258         /* Only print anything if we're at the end of one pretty output line.
  259         ** If we just broke out of the getchar loop, then there might be some
  260         ** bytes left in the buffer that didn't quite fill a line, so we will
  261         ** print it as long as the loop left the buffer unfinished.
  262         */
  263         int e, pad, unfin = ctr == LINE_MAX;
  264 
  265         if (exit ? unfin : !unfin) return;
  266 
  267         /* Print the address as a 48-bit hex integer, assuming a max size of
  268         ** 256 TiB, just in case the counter type changes at some point.
  269         */
  270         hex_print(ct - ctr, 6, ": ");
  271 
  272         /* Prints the actual prettified hex view. */
  273         for (e = 0; e <= LINE_MAX; e++)
  274                 {
  275                 /* Prints value if buffer slot filled, else do padding. */
  276                 pad = !(e & 1);
  277                 if (e <= ctr) hex_print(pretty_buf[e], 1, pad ? "" : " ");
  278                 else fputs(pad ? "  " : "   ", stdout);
  279                 }
  280         fputs("| ", stdout);
  281 
  282         /* Only used buffer slots are iterated to print ASCII values, as we
  283         ** have no need to add padding for any further columns.
  284         */
  285         for (e = 0; e <= ctr; e++)
  286                 ascii_print(pretty_buf[e]);
  287         putchar('\n');
  288         }
  289 
  290 
  291 int main(int argc, char** argv)
  292         {
  293         int c, argn;
  294         char *filen = NULL;
  295         FILE *fd;
  296 
  297         /* This is a grossly complex argument parser intended to handle many
  298         ** plausible edge cases and scenarios.
  299         ** TODO: Either document this or replace it with getopt?
  300         */
  301         for (argn = 1; argn < argc; argn++)
  302                 {
  303                 if (strchr(argv[argn], '-') == argv[argn])
  304                         {
  305 #define STRCH_ARG(a) strchr(argv[argn], a)
  306 #define CHECK_ARG(a) if (STRCH_ARG(a) != NULL)
  307                         CHECK_ARG('p') pretty = 1;
  308                         CHECK_ARG('c') count  = 1;
  309                         CHECK_ARG('h') help   = 1;
  310                         CHECK_ARG('a') ascii  = 1;
  311                         CHECK_ARG('l') limit  = 1;
  312 #define GO_INT(v, a)       v = (int) strtol(STRCH_ARG(a) + 1, NULL, 0)
  313 #define GO_CHK(v)          if (errno == ERANGE || v < 1)
  314 #define GO_VAL(a)          if (*(STRCH_ARG(a) + 1) != '\0')
  315 #define GO_ER1(v, s)       ARG_ERR("invalid " s " specified (%d)\n", v)
  316 #define GO_ER2(s)          ARG_ERR("no " s " specified\n")
  317 #define GO_DUP(a, v, s)    GO_VAL(a) \
  318         { GO_INT(v, a); GO_CHK(v) { GO_ER1(v, s); } }
  319 #define GO_DEF(a, v, s, d) CHECK_ARG(a) { GO_DUP(a, v, s) else { v = d; } }
  320 #define GO_REQ(a, v, s)    CHECK_ARG(a) { GO_DUP(a, v, s) else { GO_ER2(s); } }
  321                         GO_DEF('l', cols,   "column limit", 80);
  322                         GO_REQ('s', rlimit, "read limit");
  323                         GO_REQ('j', skip,   "data skip");
  324                         }
  325                 else if (argn == argc - 1)
  326                         {
  327                         filen = argv[argn];
  328                         }
  329                 }
  330         if (!ascii) cols >>= 1;
  331         if (filen)
  332                 {
  333                 if (!(fd = fopen(filen, "rb")))
  334                         ARG_ERR("could not open file (%s)\n", filen);
  335                 else if (skip)
  336                         {
  337                         fseek(fd, skip, SEEK_SET);
  338                         skip = 0;
  339                         }
  340                 }
  341         else
  342                 SET_BINARY_MODE(stdin);
  343         if (help)
  344                 {
  345                 printf(readme, argv[0]);
  346                 return 1;
  347                 }
  348         while ((c = filen ? fgetc(fd) : __poly_getchar()) != EOF)
  349                 {
  350                 /* If we need to skip, we just re-use count mode and then reset
  351                 ** the stream statistics and disable skip mode once the target
  352                 ** has been reached.
  353                 */
  354                 if (skip && ct < (size_t) skip) goto c_only;
  355                 else __once_l
  356                         {
  357                         ct   = 0;
  358                         skip = 0;
  359                         } __once_r;
  360 
  361                 if (rlimit && ct >= (size_t) rlimit) break;
  362 
  363                 if (!pretty)
  364                         {
  365                         /* Count mode doesn't hide the output if pretty. */
  366                         if (count) goto c_only;
  367 
  368                         /* Both of these are raw output with no spaces, etc */
  369                         if (!ascii) hex_print(c, 1, "");
  370                         else ascii_print(c);
  371 
  372                         /* Print a newline if we've handled the last character
  373                         ** in a column.  Similar logic to CTR in pretty mode.
  374                         */
  375                         if (limit && ct % cols == cols - 1)
  376                                 putchar('\n');
  377                         }
  378                 else
  379                         {
  380                         ctr = ct & LINE_MAX;
  381                         pretty_buf[ctr] = c;
  382 
  383                         /* This can safely be called for each byte, because the
  384                         ** check to see if we're at the end of a line happens
  385                         ** within the function itself.
  386                         */
  387                         dump_pretty_buf(0);
  388                         }
  389 c_only:
  390                 ct++;
  391                 }
  392         if (fd) fclose(fd);
  393         if (pretty)
  394                 {
  395                 ct--;
  396                 dump_pretty_buf(1);
  397                 if (count)
  398                         printf("       TOTAL: %" CT_PRINT " OCTETS\n", ++ct);
  399                 }
  400         else if (count)
  401                 printf("%" CT_PRINT "\n", ct);
  402         else
  403                 printf("\n");
  404 
  405         return 0;
  406         }