shx.c

    1 /* Simple HeX (SHX)
    2 ** A simplified, dependency-less, and hopefully portable utility for converting
    3 ** data into a hexadecimal representation.                                (C99)
    4 ** ----------------------------------------------------------------------------
    5 ** Copyright (c) 2022, 2023 Amelia Zabardast Ziabari
    6 **
    7 ** Redistribution and use in source and binary forms, with or without
    8 ** modification, are permitted provided that the following conditions are met:
    9 **
   10 **  1. Redistributions of source code must retain the above copyright notice,
   11 **     this list of conditions and the following disclaimer.
   12 **  2. Redistributions in binary form must reproduce the above copyright
   13 **     notice, this list of conditions and the following disclaimer in the
   14 **     documentation and/or other materials provided with the distribution.
   15 **
   16 ** THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
   17 ** AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   18 ** IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   19 ** ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
   20 ** LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
   21 ** CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
   22 ** SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
   23 ** INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
   24 ** CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
   25 ** ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
   26 ** POSSIBILITY OF SUCH DAMAGE.
   27 ** ----------------------------------------------------------------------------
   28 ** What's New?
   29 **
   30 ** (2.0.1) Re-wrote the argument parser to be more lenient, and corrected a bug
   31 **         that made the last line of the pretty print output sometimes have
   32 **         the wrong address.
   33 **
   34 ** (2.0.0) Refactored large parts of the program.
   35 **
   36 ** (1.1.3) Moved the ASCII and hex character strings into the same variable in
   37 **         order to waste less space.
   38 **
   39 ** (1.1.2) Made the input size counter display clearer in the pretty printed
   40 **         output by prefixing it, aligning it and giving it a unit.
   41 **
   42 ** (1.1.0) Fixed a few C standard compatibility issues that were making the
   43 **         program fail to compile using tools such as lcc-win32.  Mainly
   44 **         stopped using the __VA_ARGS__ version of the RUN_ONCE macro, as it
   45 **         causes issues with some compilers.
   46 */
   47 #include <stdio.h>
   48 #include <string.h>
   49 #include <stdlib.h>
   50 #include <errno.h>
   51 
   52 
   53 /* --- CONFIGURATION ------------------------------------------------------- */
   54 
   55 
   56 #define VERSION "2.0.1"
   57 
   58 /* Changing this value guarantees that each row of the pretty-printed output
   59 ** will have exactly 16 8-bit hexidecimal values, which is common in many other
   60 ** utilities with similar functionality.  This must be a power of 2 (subtracted
   61 ** by one, e.g (1 << 4) - 1 == 15, the default).
   62 */
   63 #define LINE_MAX 15
   64 
   65 /* This value determines the byte-counter type. It needs to be something
   66 ** relatively large to handle absolutely yuuge inputs. A 64-bit max is probably
   67 ** fine, given that we only print 48-bit addresses.
   68 */
   69 #define CT_TYPE size_t
   70 #define CT_PRINT "zu"
   71 
   72 
   73 /* --- HELPER MACROS ------------------------------------------------------- */
   74 
   75 
   76 /* There are many ways to check for Windows, so do them all here. */
   77 #if defined(_WIN32) || defined(WIN32) || defined(__WIN32__)
   78 # define IS_WINDOWS
   79 #endif
   80 
   81 /* We need a macro to change the mode of the standard input to binary, or else
   82 ** platforms like Windows will do annoying things to the input without asking.
   83 */
   84 #ifdef IS_WINDOWS
   85 # include <io.h>
   86 # include <fcntl.h>
   87 # define SET_BINARY_MODE(handle) _setmode(_fileno(handle), _O_BINARY)
   88 #else
   89 # define SET_BINARY_MODE(handle)
   90 #endif
   91 
   92 /* This defines a macro for function inlining hints, because the compilers that
   93 ** support it usually also implement GNU C extensions generally.
   94 */
   95 #ifdef __GNUC__
   96 # if __GNUC__ < 5
   97 #  define __poly_inline inline
   98 # else
   99 #  define __poly_inline inline __attribute__((hot))
  100 # endif
  101 #else
  102 # define __poly_inline
  103 #endif
  104 
  105 #define ARG_ERR(...) do { printf(__VA_ARGS__); return 1; } while (0)
  106 
  107 /* Effectively provides a "do once" construct. */
  108 #define __once_l do { static int ro = 0; if (!ro) { ro = 1; do
  109 #define __once_r while (0); } } while (0)
  110 
  111 /* The setmode() trick doesn't work for some Windows compiler environments,
  112 ** namely Digital Mars, so I just implemented getchar() manually using the
  113 ** typical Windows API functions.
  114 */
  115 #ifdef IS_WINDOWS
  116 # include <windows.h>
  117 static int __poly_getchar()
  118         {
  119         static HANDLE wstdin;
  120         static DWORD wbr;
  121         static unsigned char wstdinbuf[1];
  122 
  123         __once_l
  124                 {
  125                 wstdin = GetStdHandle(STD_INPUT_HANDLE);
  126                 } __once_r;
  127         ReadFile(wstdin, wstdinbuf, 1, &wbr, NULL);
  128         if (!wbr) return EOF;
  129         return wstdinbuf[0];
  130         }
  131 #else
  132 # define __poly_getchar() getchar()
  133 #endif
  134 
  135 
  136 /* --- MAIN PROGRAM -------------------------------------------------------- */
  137 
  138 
  139 static int pretty_buf[LINE_MAX + 1];
  140 static int     ctr = 0;
  141 static CT_TYPE ct  = 0;
  142 
  143 static int pretty = 0, count = 0, help = 0, rlimit = 0,
  144            ascii  = 0, limit = 0, cols = 0, skip   = 0;
  145 
  146 static const char *readme =
  147         "------ Simple HeX (" VERSION ") -------"                          "\n"
  148         "Copyright (c) 2022, 2023 Amelia Zabardast Ziabari"              "\n\n"
  149         "%s [OPTION...] [FILENAME]"                                      "\n\n"
  150         "Where OPTION is one or more of the following:"                    "\n"
  151         "  -p        use formal pretty-printing instead of raw output"     "\n"
  152         "  -c        print the total length in octets"                     "\n"
  153         "  -h        print this help information"                          "\n"
  154         "  -sXXXXX   limit the number of octets to read (after skipping)"  "\n"
  155         "  -jXXXXX   skip a certain number of octets"                      "\n"
  156         "            (faster when using FILENAME instead of stdin)"        "\n"
  157         "  -a        when in raw output, print only filtered ASCII"        "\n"
  158         "  -l[XXX]   when in raw output, limit the output columns"         "\n"
  159         "            (the default value is 80)"                          "\n\n"
  160         "The FILENAME is optional, if it is not provided, stdin is used." "\n";
  161 
  162 /* This table of ASCII characters is used both when printing the ASCII column
  163 ** of the pretty-printed output AND the hexadecimal values.  For the former, a
  164 ** strict order of the characters is not required, so for the latter we can
  165 ** simply make sure the first 16 characters are the ones required for printing
  166 ** hex values in the correct order in order to save on binary size.
  167 **
  168 ** Technically it might be easier to just have a lookup table for every 8-bit
  169 ** hexadecimal value.  However, it's not like it would massively improve the
  170 ** performance or something like that, and it would result in a larger binary,
  171 ** (even if not by much in today's standards), so it's not really worth it even
  172 ** if doing it this way instead requires some smarter code.
  173 */
  174 static const char *ascii_chars =
  175         "0123456789"
  176         "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
  177         "abcdefghijklmnopqrstuvwxyz"
  178         "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~ ";
  179 
  180 
  181 static __poly_inline void ascii_print(int c)
  182         {
  183         if (strchr(ascii_chars, c) != NULL && c != 0)
  184                 putchar(c);
  185         else
  186                 putchar('.');
  187         }
  188 
  189 
  190 /* This is a crude way to check how many bytes are required to store an
  191 ** unsigned integer, to check for overflow.
  192 */
  193 static __poly_inline int bytes_req(CT_TYPE val)
  194         {
  195         int bytes = 0;
  196         while (val != 0)
  197                 {
  198                 val >>= 8;
  199                 bytes++;
  200                 }
  201         return bytes;
  202         }
  203 
  204 
  205 /* There appear to be some weird bugs in the standard C library on some
  206 ** platforms that break the handling of long long types in the printf family
  207 ** of functions.  I'm not sure why, but to remedy it I've written my own code
  208 ** to handle printing hexadecimal numbers in this function.  It also handles
  209 ** overflow, so if a number is too big to be represented, it will just blank it
  210 ** out with asterisks instead of messing up the alignment of the output.
  211 */
  212 static void hex_print(CT_TYPE val, int pad, const char *add)
  213         {
  214 #define ADD_BUFFER 8
  215 
  216         /* Create a buffer capable of holding the maximum printable size. */
  217         static char fmt[sizeof(CT_TYPE) * 2 + 1 + ADD_BUFFER] = {0};
  218 
  219         /* Calculates the overflow based on the available padding. */
  220         int i, overflow = bytes_req(val) > pad;
  221 
  222         /* Pad the empty space with zeros.  If val too large to print, fill the
  223         ** buffer with a warning character and then skip to printing.
  224         */
  225         memset(fmt, overflow ? '*' : '0', pad <<= 1);
  226         fmt[pad--] = 0;
  227 
  228         if (overflow) goto overflow_skip;
  229 
  230         /* Iterate over the padded space and map each index to a byte out of
  231         ** the input value, and then convert that byte to a hexadecimal string
  232         ** which can be pasted into the padding space.
  233         **/
  234         for (i = 0; i <= pad; i += 2)
  235                 {
  236                 unsigned char byte = ((unsigned char *) &val)[i >> 1];
  237                 /* "pad - i" makes sure it starts writing from the end of the
  238                 ** value instead of the start, ensuring a correct order for
  239                 ** inputs larger than 8 bits.
  240                 */
  241                 fmt[pad - i - 1] = ascii_chars[byte >> 4];
  242                 fmt[pad - i    ] = ascii_chars[byte & 15];
  243                 }
  244 
  245 overflow_skip:
  246         /* XXX: It may be faster to do things this way than to concatenate the
  247         ** strings with printf.  Unsure?
  248         */
  249         if (strlen(add) < ADD_BUFFER) strcat(fmt, add);
  250         fputs((const char *) fmt, stdout);
  251         }
  252 
  253 
  254 static void dump_pretty_buf(int exit)
  255         {
  256         /* Only print anything if we're at the end of one pretty output line.
  257         ** If we just broke out of the getchar loop, then there might be some
  258         ** bytes left in the buffer that didn't quite fill a line, so we will
  259         ** print it as long as the loop left the buffer unfinished.
  260         */
  261         int e, pad, unfin = ctr == LINE_MAX;
  262 
  263         if (exit ? unfin : !unfin) return;
  264 
  265         /* Print the address as a 48-bit hex integer, assuming a max size of
  266         ** 256 TiB, just in case the counter type changes at some point.
  267         */
  268         hex_print(ct - ctr, 6, ": ");
  269 
  270         /* Prints the actual prettified hex view. */
  271         for (e = 0; e <= LINE_MAX; e++)
  272                 {
  273                 /* Prints value if buffer slot filled, else do padding. */
  274                 pad = !(e & 1);
  275                 if (e <= ctr) hex_print(pretty_buf[e], 1, pad ? "" : " ");
  276                 else fputs(pad ? "  " : "   ", stdout);
  277                 }
  278         fputs("| ", stdout);
  279 
  280         /* Only used buffer slots are iterated to print ASCII values, as we
  281         ** have no need to add padding for any further columns.
  282         */
  283         for (e = 0; e <= ctr; e++)
  284                 ascii_print(pretty_buf[e]);
  285         putchar('\n');
  286         }
  287 
  288 
  289 int main(int argc, char** argv)
  290         {
  291         int c, argn;
  292         char *filen = NULL;
  293         FILE *fd;
  294 
  295         /* This is a grossly complex argument parser intended to handle many
  296         ** plausible edge cases and scenarios.
  297         ** TODO: Either document this or replace it with getopt?
  298         */
  299         for (argn = 1; argn < argc; argn++)
  300                 {
  301                 if (strchr(argv[argn], '-') == argv[argn])
  302                         {
  303 #define STRCH_ARG(a) strchr(argv[argn], a)
  304 #define CHECK_ARG(a) if (STRCH_ARG(a) != NULL)
  305                         CHECK_ARG('p') pretty = 1;
  306                         CHECK_ARG('c') count  = 1;
  307                         CHECK_ARG('h') help   = 1;
  308                         CHECK_ARG('a') ascii  = 1;
  309                         CHECK_ARG('l') limit  = 1;
  310 #define GO_INT(v, a)       v = (int) strtol(STRCH_ARG(a) + 1, NULL, 0)
  311 #define GO_CHK(v)          if (errno == ERANGE || v < 1)
  312 #define GO_VAL(a)          if (*(STRCH_ARG(a) + 1) != '\0')
  313 #define GO_ER1(v, s)       ARG_ERR("invalid " s " specified (%d)\n", v)
  314 #define GO_ER2(s)          ARG_ERR("no " s " specified\n")
  315 #define GO_DUP(a, v, s)    GO_VAL(a) \
  316         { GO_INT(v, a); GO_CHK(v) { GO_ER1(v, s); } }
  317 #define GO_DEF(a, v, s, d) CHECK_ARG(a) { GO_DUP(a, v, s) else { v = d; } }
  318 #define GO_REQ(a, v, s)    CHECK_ARG(a) { GO_DUP(a, v, s) else { GO_ER2(s); } }
  319                         GO_DEF('l', cols,   "column limit", 80);
  320                         GO_REQ('s', rlimit, "read limit");
  321                         GO_REQ('j', skip,   "data skip");
  322                         }
  323                 else if (argn == argc - 1)
  324                         {
  325                         filen = argv[argn];
  326                         }
  327                 }
  328         if (!ascii) cols >>= 1;
  329         if (filen)
  330                 {
  331                 if (!(fd = fopen(filen, "rb")))
  332                         ARG_ERR("could not open file (%s)\n", filen);
  333                 else if (skip)
  334                         {
  335                         fseek(fd, skip, SEEK_SET);
  336                         skip = 0;
  337                         }
  338                 }
  339         else
  340                 SET_BINARY_MODE(stdin);
  341         if (help)
  342                 {
  343                 printf(readme, argv[0]);
  344                 return 1;
  345                 }
  346         while ((c = filen ? fgetc(fd) : __poly_getchar()) != EOF)
  347                 {
  348                 /* If we need to skip, we just re-use count mode and then reset
  349                 ** the stream statistics and disable skip mode once the target
  350                 ** has been reached.
  351                 */
  352                 if (skip && ct < (size_t) skip) goto c_only;
  353                 else __once_l
  354                         {
  355                         ct   = 0;
  356                         skip = 0;
  357                         } __once_r;
  358 
  359                 if (rlimit && ct >= (size_t) rlimit) break;
  360 
  361                 if (!pretty)
  362                         {
  363                         /* Count mode doesn't hide the output if pretty. */
  364                         if (count) goto c_only;
  365 
  366                         /* Both of these are raw output with no spaces, etc */
  367                         if (!ascii) hex_print(c, 1, "");
  368                         else ascii_print(c);
  369 
  370                         /* Print a newline if we've handled the last character
  371                         ** in a column.  Similar logic to CTR in pretty mode.
  372                         */
  373                         if (limit && ct % cols == cols - 1)
  374                                 putchar('\n');
  375                         }
  376                 else
  377                         {
  378                         ctr = ct & LINE_MAX;
  379                         pretty_buf[ctr] = c;
  380 
  381                         /* This can safely be called for each byte, because the
  382                         ** check to see if we're at the end of a line happens
  383                         ** within the function itself.
  384                         */
  385                         dump_pretty_buf(0);
  386                         }
  387 c_only:
  388                 ct++;
  389                 }
  390         if (fd) fclose(fd);
  391         if (pretty)
  392                 {
  393                 ct--;
  394                 dump_pretty_buf(1);
  395                 if (count)
  396                         printf("       TOTAL: %" CT_PRINT " OCTETS\n", ++ct);
  397                 }
  398         else if (count)
  399                 printf("%" CT_PRINT "\n", ct);
  400         else
  401                 printf("\n");
  402 
  403         return 0;
  404         }