/* Simple HeX (SHX)
** A simplified, dependency-less, and hopefully portable utility for converting
** data into a hexadecimal representation.                                (C99)
** ----------------------------------------------------------------------------
** Copyright (c) 2022, 2023 Amelia Zabardast Ziabari
**
** Redistribution and use in source and binary forms, with or without
** modification, are permitted provided that the following conditions are met:
**
**  1. Redistributions of source code must retain the above copyright notice,
**     this list of conditions and the following disclaimer.
**  2. Redistributions in binary form must reproduce the above copyright
**     notice, this list of conditions and the following disclaimer in the
**     documentation and/or other materials provided with the distribution.
**
** THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
** AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
** IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
** ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
** LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
** CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
** SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
** INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
** CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
** ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
** POSSIBILITY OF SUCH DAMAGE.
** ----------------------------------------------------------------------------
** What's New?
**
** (2.0.1) Re-wrote the argument parser to be more lenient, and corrected a bug
**         that made the last line of the pretty print output sometimes have
**         the wrong address.
**
** (2.0.0) Refactored large parts of the program.
**
** (1.1.3) Moved the ASCII and hex character strings into the same variable in
**         order to waste less space.
**
** (1.1.2) Made the input size counter display clearer in the pretty printed
**         output by prefixing it, aligning it and giving it a unit.
**
** (1.1.0) Fixed a few C standard compatibility issues that were making the
**         program fail to compile using tools such as lcc-win32.  Mainly
**         stopped using the __VA_ARGS__ version of the RUN_ONCE macro, as it
**         causes issues with some compilers.
*/
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <errno.h>


/* --- CONFIGURATION ------------------------------------------------------- */


/* Version string; it is pasted into the banner line of the help text. */
#define VERSION "2.0.1"

/* The highest byte index within one row of the pretty-printed output, i.e.
** the row width minus one.  The default of 15 gives rows of exactly sixteen
** 8-bit hexadecimal values, which is common in many other utilities with
** similar functionality.  The value is used as a bit mask on the byte counter
** to find the position inside the current row, so it must be a power of 2
** minus one, e.g. (1 << 4) - 1 == 15.
**
** NOTE(review): POSIX <limits.h> also defines a macro called LINE_MAX.  That
** header is not included by this file, but confirm that no platform header
** drags it in before relying on this name.
*/
#define LINE_MAX 15

/* This value determines the byte-counter type. It needs to be something
** relatively large to handle absolutely yuuge inputs. A 64-bit max is probably
** fine, given that we only print 48-bit addresses.  CT_PRINT is the printf
** conversion (without the leading '%') used to print a CT_TYPE, so the two
** must always be changed together.
*/
#define CT_TYPE size_t
#define CT_PRINT "zu"


/* --- HELPER MACROS ------------------------------------------------------- */


/* Compilers advertise a Windows target through different predefined macros,
** so all of the spellings checked here are folded into a single IS_WINDOWS
** flag that the rest of the file can test.
*/
#if defined(_WIN32) || defined(WIN32) || defined(__WIN32__)
# define IS_WINDOWS
#endif

/* We need a macro to change the mode of the standard input to binary, or else
** platforms like Windows will do annoying things to the input without asking.
** On Windows it switches the stream's descriptor to _O_BINARY; everywhere else
** it expands to nothing, so a call site is left with an empty statement.
*/
#ifdef IS_WINDOWS
# include <io.h>
# include <fcntl.h>
# define SET_BINARY_MODE(handle) _setmode(_fileno(handle), _O_BINARY)
#else
# define SET_BINARY_MODE(handle)
#endif

/* Function inlining hint used on the small helpers below.  It is only enabled
** when __GNUC__ is defined: it expands to plain `inline` for major versions
** below 5 and to `inline` plus the `hot` attribute from version 5 onwards.
** For every other compiler it expands to nothing, so the helpers become
** ordinary static functions.
**
** NOTE(review): identifiers containing a double underscore are reserved for
** the implementation by ISO C; renaming this macro would be safer, but it has
** to be done in every place that uses it at once.
*/
#ifdef __GNUC__
# if __GNUC__ < 5
#  define __poly_inline inline
# else
#  define __poly_inline inline __attribute__((hot))
# endif
#else
# define __poly_inline
#endif

#define ARG_ERR(...) do { printf(__VA_ARGS__); return 1; } while (0)

/* Effectively provides a "do once" construct. */
#define __once_l do { static int ro = 0; if (!ro) { ro = 1; do
#define __once_r while (0); } } while (0)

/* Byte-wise reader for the standard input.
**
** Switching stdin to binary with setmode() is not honoured by every Windows
** compiler environment (Digital Mars being one), so on Windows the bytes are
** fetched straight from the standard input handle with ReadFile().  A read
** that delivers no byte is reported as EOF.  On every other platform this is
** simply getchar().
*/
#ifdef IS_WINDOWS
# include <windows.h>
static int __poly_getchar()
	{
	static HANDLE in_handle;
	static DWORD n_read;
	static unsigned char one_byte[1];

	/* The handle only needs to be looked up on the very first call. */
	__once_l
		{
		in_handle = GetStdHandle(STD_INPUT_HANDLE);
		} __once_r;

	ReadFile(in_handle, one_byte, 1, &n_read, NULL);
	return n_read ? one_byte[0] : EOF;
	}
#else
# define __poly_getchar() getchar()
#endif


/* --- MAIN PROGRAM -------------------------------------------------------- */


/* Bytes of the pretty-printed row currently being assembled, one per slot. */
static int pretty_buf[LINE_MAX + 1];
/* Slot of pretty_buf that received the most recent byte (0..LINE_MAX). */
static int     ctr = 0;
/* Running byte counter; also the source of the addresses in pretty mode. */
static CT_TYPE ct  = 0;

/* Settings filled in by the argument parser in main():
**   pretty -p    count  -c    help -h    ascii -a
**   limit  -l was given            cols  value attached to -l
**   rlimit value attached to -s    skip  value attached to -j
** A value of zero means "not requested".
*/
static int pretty = 0, count = 0, help = 0, rlimit = 0,
           ascii  = 0, limit = 0, cols = 0, skip   = 0;

/* Help text.  This is a printf format string with exactly one %s conversion,
** which main() fills with the program name; any literal percent sign added
** here in the future therefore has to be written as "%%".
*/
static const char *readme =
	"------ Simple HeX (" VERSION ") -------"                          "\n"
	"Copyright (c) 2022, 2023 Amelia Zabardast Ziabari"              "\n\n"
	"%s [OPTION...] [FILENAME]"                                      "\n\n"
	"Where OPTION is one or more of the following:"                    "\n"
	"  -p        use formal pretty-printing instead of raw output"     "\n"
	"  -c        print the total length in octets"                     "\n"
	"  -h        print this help information"                          "\n"
	"  -sXXXXX   limit the number of octets to read (after skipping)"  "\n"
	"  -jXXXXX   skip a certain number of octets"                      "\n"
	"            (faster when using FILENAME instead of stdin)"        "\n"
	"  -a        when in raw output, print only filtered ASCII"        "\n"
	"  -l[XXX]   when in raw output, limit the output columns"         "\n"
	"            (the default value is 80)"                          "\n\n"
	"The FILENAME is optional, if it is not provided, stdin is used." "\n";

/* This table of ASCII characters serves two purposes at once, which saves a
** little binary size:
**
**  1. It is the set of characters that are shown as themselves in ASCII
**     output; anything not listed here is replaced by a dot.  For this use
**     the order of the characters does not matter.
**  2. Its first 16 characters ("0".."9", "A".."F") are the hexadecimal digits
**     in value order, so indexing the table with a nibble yields its
**     (upper-case) hex digit.  These 16 characters must therefore stay at the
**     front and in this order.
**
** Technically it might be easier to just have a lookup table for every 8-bit
** hexadecimal value.  However, that would not noticeably improve performance
** and it would result in a larger binary (even if not by much by today's
** standards), so it is not really worth it even if doing it this way requires
** slightly smarter code.
*/
static const char *ascii_chars =
	"0123456789"
	"ABCDEFGHIJKLMNOPQRSTUVWXYZ"
	"abcdefghijklmnopqrstuvwxyz"
	"!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~ ";


/* Writes one input byte to stdout for the ASCII views: the byte itself when it
** is listed in ascii_chars, a dot otherwise.  The zero byte needs its own
** check because strchr() would match it against the string terminator.
*/
static __poly_inline void ascii_print(int c)
	{
	int printable = c != 0 && strchr(ascii_chars, c) != NULL;

	putchar(printable ? c : '.');
	}


/* Returns how many 8-bit bytes are needed to represent val (0 for zero).  It
** is a crude measure, used to detect values too wide for a print field.
*/
static __poly_inline int bytes_req(CT_TYPE val)
	{
	int needed;

	/* Drop one byte per round until nothing is left. */
	for (needed = 0; val != 0; val >>= 8)
		needed++;
	return needed;
	}


/* Prints val to stdout as a zero-padded, upper-case hexadecimal number that
** is exactly `pad` bytes (2 * pad digits) wide, followed by the short string
** `add`.
**
**   val  the number to print
**   pad  field width in bytes; a value that needs more bytes than this is
**        shown as a run of asterisks so the alignment of the output survives
**   add  suffix appended after the digits; it is silently dropped when it is
**        ADD_BUFFER characters or longer, or would not fit the buffer
**
** There appear to be some weird bugs in the standard C library on some
** platforms that break the handling of long long types in the printf family
** of functions, which is why the conversion is done by hand here.
**
** The digits are extracted with shifts rather than by reading the bytes of
** val through a character pointer: the result is then independent of the
** host byte order, and nothing is read beyond val when the field is wider
** than CT_TYPE (e.g. the 48-bit address on a target with a 32-bit size_t).
*/
static void hex_print(CT_TYPE val, int pad, const char *add)
	{
#define ADD_BUFFER 8

	/* Create a buffer capable of holding the maximum printable size. */
	static char fmt[sizeof(CT_TYPE) * 2 + 1 + ADD_BUFFER] = {0};
	const size_t cap = sizeof(fmt) - 1;
	size_t digits, pos, add_len;
	int overflow;

	/* Two digits per byte, but never more than the buffer can take. */
	if (pad < 0) pad = 0;
	digits = (size_t) pad * 2;
	if (digits > cap) digits = cap;

	/* Pad the empty space with zeros.  If val is too large to print, fill
	** the field with a warning character instead and leave it at that.
	*/
	overflow = bytes_req(val) > pad;
	memset(fmt, overflow ? '*' : '0', digits);
	fmt[digits] = 0;

	if (!overflow)
		{
		/* Write the nibbles from the right-hand end of the field, the
		** least significant one first; whatever is not reached keeps
		** its '0' padding.
		*/
		for (pos = digits; pos > 0 && val != 0; val >>= 4)
			fmt[--pos] = ascii_chars[val & 15];
		}

	/* XXX: It may be faster to do things this way than to concatenate the
	** strings with printf.  Unsure?
	*/
	add_len = strlen(add);
	if (add_len < ADD_BUFFER && digits + add_len <= cap)
		memcpy(fmt + digits, add, add_len + 1);
	fputs(fmt, stdout);
	}


/* Prints one row of the pretty view: address, hex columns and ASCII column.
**
** The function decides on its own whether there is anything to print, so it
** can be called after every byte:
**   exit == 0  (called from the read loop) prints only when the byte just
**              stored filled the last slot of the row;
**   exit != 0  (called once after the loop) prints only when the final row
**              was left partly filled, since a full one was printed already.
*/
static void dump_pretty_buf(int exit)
	{
	const int row_full = (ctr == LINE_MAX);
	int col;

	if (exit)
		{
		if (row_full) return;
		}
	else if (!row_full) return;

	/* Print the address as a 48-bit hex integer, assuming a max size of
	** 256 TiB, just in case the counter type changes at some point.
	*/
	hex_print(ct - ctr, 6, ": ");

	/* Hex columns: the values are grouped in pairs, so only the odd slots
	** are followed by a space.  Unused slots are blanked to the same
	** width to keep the ASCII column aligned.
	*/
	for (col = 0; col <= LINE_MAX; col++)
		{
		const int odd = col & 1;

		if (col > ctr)
			fputs(odd ? "   " : "  ", stdout);
		else
			hex_print(pretty_buf[col], 1, odd ? " " : "");
		}
	fputs("| ", stdout);

	/* ASCII column: nothing follows it, so only the used slots matter. */
	col = 0;
	while (col <= ctr)
		ascii_print(pretty_buf[col++]);
	putchar('\n');
	}


int main(int argc, char** argv)
	{
	int c, argn;
	char *filen = NULL;
	FILE *fd;

	/* This is a grossly complex argument parser intended to handle many
	** plausible edge cases and scenarios.
	** TODO: Either document this or replace it with getopt?
	*/
	for (argn = 1; argn < argc; argn++)
		{
		if (strchr(argv[argn], '-') == argv[argn])
			{
#define STRCH_ARG(a) strchr(argv[argn], a)
#define CHECK_ARG(a) if (STRCH_ARG(a) != NULL)
			CHECK_ARG('p') pretty = 1;
			CHECK_ARG('c') count  = 1;
			CHECK_ARG('h') help   = 1;
			CHECK_ARG('a') ascii  = 1;
			CHECK_ARG('l') limit  = 1;
#define GO_INT(v, a)       v = (int) strtol(STRCH_ARG(a) + 1, NULL, 0)
#define GO_CHK(v)          if (errno == ERANGE || v < 1)
#define GO_VAL(a)          if (*(STRCH_ARG(a) + 1) != '\0')
#define GO_ER1(v, s)       ARG_ERR("invalid " s " specified (%d)\n", v)
#define GO_ER2(s)          ARG_ERR("no " s " specified\n")
#define GO_DUP(a, v, s)    GO_VAL(a) \
	{ GO_INT(v, a); GO_CHK(v) { GO_ER1(v, s); } }
#define GO_DEF(a, v, s, d) CHECK_ARG(a) { GO_DUP(a, v, s) else { v = d; } }
#define GO_REQ(a, v, s)    CHECK_ARG(a) { GO_DUP(a, v, s) else { GO_ER2(s); } }
			GO_DEF('l', cols,   "column limit", 80);
			GO_REQ('s', rlimit, "read limit");
			GO_REQ('j', skip,   "data skip");
			}
		else if (argn == argc - 1)
			{
			filen = argv[argn];
			}
		}
	if (!ascii) cols >>= 1;
	if (filen)
		{
		if (!(fd = fopen(filen, "rb")))
			ARG_ERR("could not open file (%s)\n", filen);
		else if (skip)
			{
			fseek(fd, skip, SEEK_SET);
			skip = 0;
			}
		}
	else
		SET_BINARY_MODE(stdin);
	if (help)
		{
		printf(readme, argv[0]);
		return 1;
		}
	while ((c = filen ? fgetc(fd) : __poly_getchar()) != EOF)
		{
		/* If we need to skip, we just re-use count mode and then reset
		** the stream statistics and disable skip mode once the target
		** has been reached.
		*/
		if (skip && ct < (size_t) skip) goto c_only;
		else __once_l
			{
			ct   = 0;
			skip = 0;
			} __once_r;

		if (rlimit && ct >= (size_t) rlimit) break;

		if (!pretty)
			{
			/* Count mode doesn't hide the output if pretty. */
			if (count) goto c_only;

			/* Both of these are raw output with no spaces, etc */
			if (!ascii) hex_print(c, 1, "");
			else ascii_print(c);

			/* Print a newline if we've handled the last character
			** in a column.  Similar logic to CTR in pretty mode.
			*/
			if (limit && ct % cols == cols - 1)
				putchar('\n');
			}
		else
			{
			ctr = ct & LINE_MAX;
			pretty_buf[ctr] = c;

			/* This can safely be called for each byte, because the
			** check to see if we're at the end of a line happens
			** within the function itself.
			*/
			dump_pretty_buf(0);
			}
c_only:
		ct++;
		}
	if (fd) fclose(fd);
	if (pretty)
		{
		ct--;
		dump_pretty_buf(1);
		if (count)
			printf("       TOTAL: %" CT_PRINT " OCTETS\n", ++ct);
		}
	else if (count)
		printf("%" CT_PRINT "\n", ct);
	else
		printf("\n");

	return 0;
	}
