From 71aaa3f702977f3d87e96170dc29fb14e4757688 Mon Sep 17 00:00:00 2001 From: Philippe Proulx Date: Mon, 25 Sep 2023 17:16:28 -0400 Subject: [PATCH] The earliest commit (also Normand 0.1.0) Signed-off-by: Philippe Proulx --- .gitignore | 162 +++++ README.adoc | 931 +++++++++++++++++++++++++++ normand/__init__.py | 26 + normand/normand.py | 1464 +++++++++++++++++++++++++++++++++++++++++++ poetry.lock | 7 + pyproject.toml | 66 ++ 6 files changed, 2656 insertions(+) create mode 100644 .gitignore create mode 100644 README.adoc create mode 100644 normand/__init__.py create mode 100644 normand/normand.py create mode 100644 poetry.lock create mode 100644 pyproject.toml diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..c510331 --- /dev/null +++ b/.gitignore @@ -0,0 +1,162 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. 
+*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. +# https://pdm.fming.dev/#use-with-ide +.pdm.toml + +# PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. +#.idea/ + +README.html diff --git a/README.adoc b/README.adoc new file mode 100644 index 0000000..c3ebe7f --- /dev/null +++ b/README.adoc @@ -0,0 +1,931 @@ += Normand +Philippe Proulx +:toc: left + +:py3: Python{nbsp}3 + +[.normal] +image:https://img.shields.io/pypi/v/normand.svg?label=Latest%20version[link="https://pypi.python.org/pypi/normand"] + +[.lead] +_**Normand**_ is a text-to-binary processor with its own language. + +This package offers both a portable {py3} module and a command-line +tool. + +WARNING: This version of Normand is 0.1, meaning both the Normand +language and the module/CLI interface aren't stable. + +== Introduction + +The purpose of Normand is to consume human-readable text representing +bytes and to produce the corresponding binary data. + +.Simple bytes input. +==== +Consider the following Normand input: + +---- +4f 55 32 bb $167 fe %10100111 a9 $-32 +---- + +The generated nine bytes are: + +---- +4f 55 32 bb a7 fe a7 a9 e0 +---- +==== + +As you can see in the last example, the fundamental unit of the Normand +language is the _byte_. 
The order in which you list bytes will be the +order of the generated data. + +The Normand language is more than simple lists of bytes, though. Its +main features are: + +Comments, including a bunch of insignificant symbols which may improve readability:: ++ +Input: ++ +---- +ff bb %1101:0010 # This is a comment +78 29 af $192 # This too # 99 $-80 +fe80::6257:18ff:fea3:4229 +60:57:18:a3:42:29 +10839636-5d65-4a68-8e6a-21608ddf7258 +---- ++ +Output: ++ +---- +ff bb d2 78 29 af c0 99 b0 fe 80 62 57 18 ff fe +a3 42 29 60 57 18 a3 42 29 10 83 96 36 5d 65 4a +68 8e 6a 21 60 8d df 72 58 +---- + +Hexadecimal, decimal, and binary byte constants:: ++ +Input: ++ +---- +aa bb $247 $-89 %0011_0010 %11.01= 10/10 +---- ++ +Output: ++ +---- +aa bb f7 a7 32 da +---- + +UTF-8, UTF-16, and UTF-32 literal strings:: ++ +Input: ++ +---- +"hello world!" 00 +u16le"stress\nverdict 🤣" +---- ++ +Output: ++ +---- +68 65 6c 6c 6f 20 77 6f 72 6c 64 21 00 73 00 74 ┆ hello world!•s•t +00 72 00 65 00 73 00 73 00 0a 00 76 00 65 00 72 ┆ •r•e•s•s•••v•e•r +00 64 00 69 00 63 00 74 00 20 00 3e d8 23 dd ┆ •d•i•c•t• •>•#• +---- + +Labels: special variables holding the offset where they're defined:: ++ +---- + b2 52 e3 bc 91 05 +$100 $50 33 9f fe +25 e9 89 8a +---- + +Variables:: ++ +---- +5e 65 {tower = 47} c6 7f f2 c4 +44 {hurl = tower - 14} b5 {tower = hurl} 26 2d +---- ++ +The value of a variable assignment is the evaluation of a valid {py3} +expression which may include label and variable names. + +Value encoding with a specific length (8{nbsp}bits to 64{nbsp}bits) and byte order:: ++ +Input: ++ +---- +{strength = 4} +{be} 67 44 $178 {(end - lbl) * 8 + strength : 16} $99 +{le} {-1993 : 32} +---- ++ +Output: ++ +---- +67 44 b2 00 2c 63 37 f8 ff ff +---- ++ +The encoded value is the evaluation of a valid {py3} expression which +may include label and variable names. 
+ +Repetition:: ++ +Input: ++ +---- +aa bb * 5 cc "yeah\0" * 8 +---- ++ +Output: ++ +---- +aa bb bb bb bb bb cc 79 65 61 68 00 79 65 61 68 ┆ •••••••yeah.yeah +00 79 65 61 68 00 79 65 61 68 00 79 65 61 68 00 ┆ •yeah•yeah•yeah• +79 65 61 68 00 79 65 61 68 00 79 65 61 68 00 ┆ yeah•yeah•yeah• +---- + + +Multilevel grouping:: ++ +Input: ++ +---- +ff ((aa bb "zoom" cc) * 5) * 3 $-34 * 4 +---- ++ +Output: ++ +---- +ff aa bb 7a 6f 6f 6d cc aa bb 7a 6f 6f 6d cc aa ┆ •••zoom•••zoom•• +bb 7a 6f 6f 6d cc aa bb 7a 6f 6f 6d cc aa bb 7a ┆ •zoom•••zoom•••z +6f 6f 6d cc aa bb 7a 6f 6f 6d cc aa bb 7a 6f 6f ┆ oom•••zoom•••zoo +6d cc aa bb 7a 6f 6f 6d cc aa bb 7a 6f 6f 6d cc ┆ m•••zoom•••zoom• +aa bb 7a 6f 6f 6d cc aa bb 7a 6f 6f 6d cc aa bb ┆ ••zoom•••zoom••• +7a 6f 6f 6d cc aa bb 7a 6f 6f 6d cc aa bb 7a 6f ┆ zoom•••zoom•••zo +6f 6d cc aa bb 7a 6f 6f 6d cc de de de de ┆ om•••zoom••••• +---- + +Precise error reporting:: ++ +---- +/tmp/meow.normand:10:24 - Expecting a bit (`0` or `1`). +---- ++ +---- +/tmp/meow.normand:32:6 - Unexpected character `k`. +---- ++ +---- +/tmp/meow.normand:24:19 - Unknown variable/label name `meow` in expression `(meow - 45) // 8`. +---- ++ +---- +/tmp/meow.normand:18:9 - Value 315 is outside the 8-bit range when evaluating expression `end - ICITTE` at byte offset 45. +---- + +You can use Normand to track data source files in your favorite VCS +instead of raw binary files. The binary files that Normand generates can +be used to test file format decoding, including malformatted data, for +example, as well as for education. + +See <> to explore all the Normand features. + +== Install Normand + +Normand requires Python ≥ 3.4. + +To install Normand: + +---- +$ python3 -m pip install --user normand +---- + +See +https://packaging.python.org/en/latest/tutorials/installing-packages/#installing-to-the-user-site[Installing to the User Site] +to learn more about a user site installation. 
+ +[NOTE] +==== +Normand has a single module file, `normand.py`, which you can copy as is +to your project to use it (both the <> +function and the <>). + +`normand.py` has _no external dependencies_, but if you're using +Python{nbsp}3.4, you'll need a local copy of the standard `typing` +module. +==== + +== Learn Normand + +A Normand text input is a sequence of items which represent a sequence +of raw bytes. + +[[state]] During the processing of items to data, Normand relies on a +current state: + +[%header%autowidth] +|=== +|State variable |Description |Initial value: <> |Initial value: <> + +|[[cur-offset]] Current offset +| +The current offset has an effect on the value of +<> and of the special `ICITTE` name in <> and +<> expression evaluation. + +Each generated byte increments the current offset. + +A <> may change the +current offset. +|`init_offset` parameter of the `parse()` function. +|`--offset` option. + +|[[cur-bo]] Current byte order +| +The current byte order has an effect on the encoding of <>. + +A <> may change +the current byte order. +|`init_byte_order` parameter of the `parse()` function. +|`--byte-order` option. + +|<> +|Mapping of label names to integral values. +|`init_labels` parameter of the `parse()` function. +|One or more `--label` options. + +|<> +|Mapping of variable names to integral values. +|`init_variables` parameter of the `parse()` function. +|One or more `--var` options. +|=== + +The available items are: + +* A <> representing a single byte. + +* A <> representing a sequence of bytes + encoding UTF-8, UTF-16, or UTF-32 data. + +* A <> (big or + little endian). + +* A <> as an unsigned or signed + integer to be encoded on one or more bytes using the current byte + order. + +* A <>. + +* A <>, that is, a named constant holding the current + offset. ++ +This is similar to an assembly label. + +* A <> associating a name to + the integral result of an evaluated {py3} expression. + +* A <>, that is, a scoped sequence of items. 
+ +Moreover, you can <> any item above, except an offset +or a label, a given number of times. This is called a repetition. + +A Normand comment may exist: + +* Between items, possibly within a group. +* Between the nibbles of a constant hexadecimal byte. +* Between the bits of a constant binary byte. +* Between the last item and the ``pass:[*]`` character of a repetition, + and between that ``pass:[*]`` character and the following number. + +A comment is anything between two ``pass:[#]`` characters on the same +line, or from ``pass:[#]`` until the end of the line. Whitespaces and +the following symbol characters are also considered comments where a +comment may exist: + +---- +! @ / \ ? & : ; . , + [ ] _ = | - +---- + +The latter serve to improve readability so that you may write, for +example, a MAC address or a UUID as is. + +You can test the examples of this section with the `normand` +<> as such: + +---- +$ normand file | hexdump -C +---- + +where `file` is the name of a file containing the Normand input. + +=== Byte constant + +A _byte constant_ represents a single byte. + +A byte constant is: + +Hexadecimal form:: + Two consecutive hexits. + +Decimal form:: + A decimal number after the `$` prefix. + +Binary form:: + Eight bits after the `%` prefix. + +==== +Input: + +---- +ab cd [3d 8F] CC +---- + +Output: + +---- +ab cd 3d 8f cc +---- +==== + +==== +Input: + +---- +$192 %1100/0011 $ -77 +---- + +Output: + +---- +c0 c3 b3 +---- +==== + +==== +Input: + +---- +58f64689-6316-4d55-8a1a-04cada366172 +fe80::6257:18ff:fea3:4229 +---- + +Output: + +---- +58 f6 46 89 63 16 4d 55 8a 1a 04 ca da 36 61 72 ┆ X•F•c•MU•••••6ar +fe 80 62 57 18 ff fe a3 42 29 ┆ ••bW••••B) +---- +==== + +==== +Input: + +---- +%01110011 %01100001 %01101100 %01110101 %01110100 +---- + +Output: + +---- +73 61 6c 75 74 ┆ salut +---- +==== + +=== Literal string + +A _literal string_ represents the UTF-8-, UTF-16-, or UTF-32-encoded +bytes of a string. 
+ +The string to encode isn't implicitly null-terminated: use `\0` at the +end of the string to add a null character. + +A literal string is: + +. **Optional**: one of the following encodings instead of UTF-8: ++ +-- +[horizontal] +`u16be`:: UTF-16BE. +`u16le`:: UTF-16LE. +`u32be`:: UTF-32BE. +`u32le`:: UTF-32LE. +-- + +. The ``pass:["]`` prefix. + +. A sequence of zero or more characters, possibly containing escape + sequences. ++ +An escape sequence is the ``\`` character followed by one of: ++ +-- +[horizontal] +`0`:: Null (U+0000) +`a`:: Alert (U+0007) +`b`:: Backspace (U+0008) +`e`:: Escape (U+001B) +`f`:: Form feed (U+000C) +`n`:: End of line (U+000A) +`r`:: Carriage return (U+000D) +`t`:: Character tabulation (U+0009) +`v`:: Line tabulation (U+000B) +``\``:: Reverse solidus (U+005C) +``pass:["]``:: Quotation mark (U+0022) +-- + +. The ``pass:["]`` suffix. + +==== +Input: + +---- +"coucou tout le monde!" +---- + +Output: + +---- +63 6f 75 63 6f 75 20 74 6f 75 74 20 6c 65 20 6d ┆ coucou tout le m +6f 6e 64 65 21 ┆ onde! +---- +==== + +==== +Input: + +---- +u16le"I am not young enough to know everything." 
+---- + +Output: + +---- +49 00 20 00 61 00 6d 00 20 00 6e 00 6f 00 74 00 ┆ I• •a•m• •n•o•t• +20 00 79 00 6f 00 75 00 6e 00 67 00 20 00 65 00 ┆ •y•o•u•n•g• •e• +6e 00 6f 00 75 00 67 00 68 00 20 00 74 00 6f 00 ┆ n•o•u•g•h• •t•o• +20 00 6b 00 6e 00 6f 00 77 00 20 00 65 00 76 00 ┆ •k•n•o•w• •e•v• +65 00 72 00 79 00 74 00 68 00 69 00 6e 00 67 00 ┆ e•r•y•t•h•i•n•g• +2e 00 ┆ .• +---- +==== + +==== +Input: + +---- +u32be "\"illusion is the first\nof all pleasures\" 🦉" +---- + +Output: + +---- +00 00 00 22 00 00 00 69 00 00 00 6c 00 00 00 6c ┆ •••"•••i•••l•••l +00 00 00 75 00 00 00 73 00 00 00 69 00 00 00 6f ┆ •••u•••s•••i•••o +00 00 00 6e 00 00 00 20 00 00 00 69 00 00 00 73 ┆ •••n••• •••i•••s +00 00 00 20 00 00 00 74 00 00 00 68 00 00 00 65 ┆ ••• •••t•••h•••e +00 00 00 20 00 00 00 66 00 00 00 69 00 00 00 72 ┆ ••• •••f•••i•••r +00 00 00 73 00 00 00 74 00 00 00 0a 00 00 00 6f ┆ •••s•••t•••••••o +00 00 00 66 00 00 00 20 00 00 00 61 00 00 00 6c ┆ •••f••• •••a•••l +00 00 00 6c 00 00 00 20 00 00 00 70 00 00 00 6c ┆ •••l••• •••p•••l +00 00 00 65 00 00 00 61 00 00 00 73 00 00 00 75 ┆ •••e•••a•••s•••u +00 00 00 72 00 00 00 65 00 00 00 73 00 00 00 22 ┆ •••r•••e•••s•••" +00 00 00 20 00 01 f9 89 ┆ ••• •••• +---- +==== + +=== Current byte order setting + +This special item sets the <>. + +The two accepted forms are: + +[horizontal] +``pass:[{be}]``:: Set the current byte order to big endian. +``pass:[{le}]``:: Set the current byte order to little endian. + +=== Value + +A _value_ represents a fixed number of bytes encoding an unsigned or +signed integer which is the result of evaluating a {py3} expression +using the <>. + +For a value at some source location{nbsp}__**L**__, its {py3} expression +may contain the name of any accessible <>, including the +name of a label defined after{nbsp}__**L**__, as well as the name of any +<> known at{nbsp}__**L**__. + +An accessible label is either: + +* Outside of the current <>. +* Within the same immediate group (not within a nested group). 
+ +In the {py3} expression of a value, the value of the special name +`ICITTE` is the <> (before encoding the +value). + +A value is: + +. The ``pass:[{]`` prefix. + +. A valid {py3} expression. + +. The `:` character. + +. An encoding length in bits amongst `8`, `16`, `24`, `32`, `40`, + `48`, `56`, and `64`. + +. The `}` suffix. + +==== +Input: + +---- +{le} {345:16} +{be} {-0xabcd:32} +---- + +Output: + +---- +59 01 ff ff 54 33 +---- +==== + +==== +Input: + +---- +{be} + +# String length in bits +{8 * (str_end - str_beg) : 16} + +# String + + "hello world!" + +---- + +Output: + +---- +00 60 68 65 6c 6c 6f 20 77 6f 72 6c 64 21 ┆ •`hello world! +---- +==== + +==== +Input: + +---- +{20 - ICITTE : 8} * 10 +---- + +Output: + +---- +14 13 12 11 10 0f 0e 0d 0c 0b +---- +==== + +=== Current offset setting + +This special item sets the <>. + +A current offset setting is: + +. The `<` prefix. + +. A positive integer (hexadecimal starting with `0x` or `0X` accepted) + which is the new current offset. + +. The `>` suffix. + +==== +Input: + +---- + {ICITTE : 8} * 8 +<0x61> {ICITTE : 8} * 8 +---- + +Output: + +---- +00 01 02 03 04 05 06 07 61 62 63 64 65 66 67 68 ┆ ••••••••abcdefgh +---- +==== + +==== +Input: + +---- +aa bb cc dd ee ff +<12> 11 22 33 44 55 +{meow : 8} {mix : 8} +---- + +Output: + +---- +aa bb cc dd ee ff 11 22 33 44 55 04 0f ┆ •••••••"3DU•• +---- +==== + +=== Label + +A _label_ associates a name to the <>. + +All the labels of a whole Normand input must have unique names. + +A label may not share the name of a <> +name. + +A label name may not be `ICITTE` (see <> and +<> to learn more). + +A label is: + +. The `<` prefix. + +. A valid {py3} name which is not `ICITTE`. + +. The `>` suffix. + +=== Variable assignment + +A _variable assignment_ associates a name to the integral result of an +evaluated {py3} expression. 
+ +For a variable assignment at some source location{nbsp}__**L**__, its +{py3} expression may contain the name of any accessible <<Label,label>>, +including the name of a label defined after{nbsp}__**L**__, as well as +the name of any variable known at{nbsp}__**L**__. + +An accessible label is either: + +* Outside of the current <<Group,group>>. +* Within the same immediate group (not within a nested group). + +A variable name may not be `ICITTE` (see <<Value>> and +<<Variable assignment>> to learn more). + +In the {py3} expression of a variable assignment, the special name +`ICITTE` is the <<cur-offset,current offset>>. + +A variable assignment is: + +. The ``pass:[{]`` prefix. + +. A valid {py3} name which is not `ICITTE`. + +. The `=` character. + +. A valid {py3} expression. + +. The `}` suffix. + +==== +Input: + +---- +{mix = 101} {le} +{meow = 42} 11 22 {meow:8} 33 {meow = ICITTE + 17} +"yooo" {meow + mix : 16} +---- + +Output: + +---- +11 22 2a 33 79 6f 6f 6f 7a 00 ┆ •"*3yoooz• +---- +==== + +=== Group + +A _group_ is a scoped sequence of items. + +The <<Label,labels>> within a group aren't visible outside of it. + +The main purpose of a group is to <<Repetition,repeat>> more than a +single item. + +A group is: + +. The `(` prefix. + +. Zero or more items. + +. The `)` suffix. 
+ +==== +Input: + +---- +((aa bb cc) dd () ee) "leclerc" +---- + +Output: + +---- +aa bb cc dd ee 6c 65 63 6c 65 72 63 ┆ •••••leclerc +---- +==== + +==== +Input: + +---- +((aa bb cc) * 3 dd ee) * 5 +---- + +Output: + +---- +aa bb cc aa bb cc aa bb cc dd ee aa bb cc aa bb +cc aa bb cc dd ee aa bb cc aa bb cc aa bb cc dd +ee aa bb cc aa bb cc aa bb cc dd ee aa bb cc aa +bb cc aa bb cc dd ee +---- +==== + +==== +Input: + +---- +{be} +( + u16le"sébastien diaz" + {ICITTE - str_beg : 8} + {(end - str_beg) * 5 : 24} +) * 3 + +---- + +Output: + +---- +73 00 e9 00 62 00 61 00 73 00 74 00 69 00 65 00 ┆ s•••b•a•s•t•i•e• +6e 00 20 00 64 00 69 00 61 00 7a 00 1c 00 01 e0 ┆ n• •d•i•a•z••••• +73 00 e9 00 62 00 61 00 73 00 74 00 69 00 65 00 ┆ s•••b•a•s•t•i•e• +6e 00 20 00 64 00 69 00 61 00 7a 00 1c 00 01 40 ┆ n• •d•i•a•z••••@ +73 00 e9 00 62 00 61 00 73 00 74 00 69 00 65 00 ┆ s•••b•a•s•t•i•e• +6e 00 20 00 64 00 69 00 61 00 7a 00 1c 00 00 a0 ┆ n• •d•i•a•z••••• +---- +==== + +=== Repetition + +A _repetition_ represents the bytes of an item repeated a given number +of times. + +A repetition is: + +. Any item. + +. The ``pass:[*]`` character. + +. A positive integer (hexadecimal starting with `0x` or `0X` accepted) + which is the number of times to repeat the previous item. 
+ +==== +Input: + +---- +{end - ICITTE - 1 : 8} * 0x100 +---- + +Output: + +---- +ff fe fd fc fb fa f9 f8 f7 f6 f5 f4 f3 f2 f1 f0 ┆ •••••••••••••••• +ef ee ed ec eb ea e9 e8 e7 e6 e5 e4 e3 e2 e1 e0 ┆ •••••••••••••••• +df de dd dc db da d9 d8 d7 d6 d5 d4 d3 d2 d1 d0 ┆ •••••••••••••••• +cf ce cd cc cb ca c9 c8 c7 c6 c5 c4 c3 c2 c1 c0 ┆ •••••••••••••••• +bf be bd bc bb ba b9 b8 b7 b6 b5 b4 b3 b2 b1 b0 ┆ •••••••••••••••• +af ae ad ac ab aa a9 a8 a7 a6 a5 a4 a3 a2 a1 a0 ┆ •••••••••••••••• +9f 9e 9d 9c 9b 9a 99 98 97 96 95 94 93 92 91 90 ┆ •••••••••••••••• +8f 8e 8d 8c 8b 8a 89 88 87 86 85 84 83 82 81 80 ┆ •••••••••••••••• +7f 7e 7d 7c 7b 7a 79 78 77 76 75 74 73 72 71 70 ┆ •~}|{zyxwvutsrqp +6f 6e 6d 6c 6b 6a 69 68 67 66 65 64 63 62 61 60 ┆ onmlkjihgfedcba` +5f 5e 5d 5c 5b 5a 59 58 57 56 55 54 53 52 51 50 ┆ _^]\[ZYXWVUTSRQP +4f 4e 4d 4c 4b 4a 49 48 47 46 45 44 43 42 41 40 ┆ ONMLKJIHGFEDCBA@ +3f 3e 3d 3c 3b 3a 39 38 37 36 35 34 33 32 31 30 ┆ ?>=<;:9876543210 +2f 2e 2d 2c 2b 2a 29 28 27 26 25 24 23 22 21 20 ┆ /.-,+*)('&%$#"! +1f 1e 1d 1c 1b 1a 19 18 17 16 15 14 13 12 11 10 ┆ •••••••••••••••• +0f 0e 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 ┆ •••••••••••••••• +---- +==== + +== Command-line tool + +If you <> the `normand` package, then you +can use the `normand` command-line tool: + +---- +$ normand <<< '"ma gang de malades"' | hexdump -C +---- + +---- +00000000 6d 61 20 67 61 6e 67 20 64 65 20 6d 61 6c 61 64 |ma gang de malad| +00000010 65 73 |es| +---- + +If you copy the `normand.py` module to your own project, then you can +run the module itself: + +---- +$ python3 -m normand <<< '"ma gang de malades"' | hexdump -C +---- + +---- +00000000 6d 61 20 67 61 6e 67 20 64 65 20 6d 61 6c 61 64 |ma gang de malad| +00000010 65 73 |es| +---- + +Without a path argument, the `normand` tool reads from the standard +input. + +The `normand` tool prints the generated binary data to the standard +output. 
+ +Various options control the initial <<state,state>> of the processor: +use the `--help` option to learn more. + +== {py3} API + +The whole `normand` package/module API is: + +[source,python] +---- +class ByteOrder(enum.Enum): + # Big endian. + BE = ... + + # Little endian. + LE = ... + + +VarsT = typing.Dict[str, int] + + +class TextLoc: + # Line number. + @property + def line_no(self) -> int: + ... + + # Column number. + @property + def col_no(self) -> int: + ... + + +class ParseError(RuntimeError): + # Source text location. + @property + def text_loc(self) -> TextLoc: + ... + + +class ParseResult: + # Generated data. + @property + def data(self) -> bytearray: + ... + + # Updated variable values. + @property + def variables(self) -> VarsT: + ... + + # Updated main group label values. + @property + def labels(self) -> VarsT: + ... + + # Final offset. + @property + def offset(self) -> int: + ... + + # Final byte order. + @property + def byte_order(self) -> typing.Optional[ByteOrder]: + ... + +def parse(normand: str, + init_variables: typing.Optional[VarsT] = None, + init_labels: typing.Optional[VarsT] = None, + init_offset: int = 0, + init_byte_order: typing.Optional[ByteOrder] = None) -> ParseResult: + ... +---- + +The `normand` parameter is the actual <<Learn Normand,Normand input>> +while the other parameters control the initial <<state,state>>. + +The `parse()` function raises a `ParseError` instance should it fail to +parse the `normand` string for any reason. 
diff --git a/normand/__init__.py b/normand/__init__.py new file mode 100644 index 0000000..a0e9385 --- /dev/null +++ b/normand/__init__.py @@ -0,0 +1,26 @@ +# The MIT License (MIT) +# +# Copyright (c) 2023 Philippe Proulx +# +# Permission is hereby granted, free of charge, to any person obtaining +# a copy of this software and associated documentation files (the +# "Software"), to deal in the Software without restriction, including +# without limitation the rights to use, copy, modify, merge, publish, +# distribute, sublicense, and/or sell copies of the Software, and to +# permit persons to whom the Software is furnished to do so, subject to +# the following conditions: +# +# The above copyright notice and this permission notice shall be +# included in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +# IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +# CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ +from .normand import * + +del normand diff --git a/normand/normand.py b/normand/normand.py new file mode 100644 index 0000000..31c3aff --- /dev/null +++ b/normand/normand.py @@ -0,0 +1,1464 @@ +# The MIT License (MIT) +# +# Copyright (c) 2023 Philippe Proulx +# +# Permission is hereby granted, free of charge, to any person obtaining +# a copy of this software and associated documentation files (the +# "Software"), to deal in the Software without restriction, including +# without limitation the rights to use, copy, modify, merge, publish, +# distribute, sublicense, and/or sell copies of the Software, and to +# permit persons to whom the Software is furnished to do so, subject to +# the following conditions: +# +# The above copyright notice and this permission notice shall be +# included in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +# IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +# CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +__author__ = "Philippe Proulx" +__version__ = "0.1.0" +__all__ = [ + "ByteOrder", + "parse", + "ParseError", + "ParseResult", + "TextLoc", + "VarsT", + "__author__", + "__version__", +] + +import re +import abc +import ast +import sys +import enum +import struct +from typing import Any, Dict, List, Union, Pattern, Callable, NoReturn, Optional + + +# Text location (line and column numbers). 
class TextLoc:
    # Creates and returns a text location from the line number
    # `line_no` and the column number `col_no`.
    #
    # This is the only way to build an instance: __init__() always
    # raises.
    @classmethod
    def _create(cls, line_no: int, col_no: int):
        self = cls.__new__(cls)
        self._init(line_no, col_no)
        return self

    # Direct construction isn't supported: use _create().
    def __init__(self, *args, **kwargs):  # type: ignore
        raise NotImplementedError

    def _init(self, line_no: int, col_no: int):
        self._line_no = line_no
        self._col_no = col_no

    # Line number.
    @property
    def line_no(self):
        return self._line_no

    # Column number.
    @property
    def col_no(self):
        return self._col_no

    # The item classes below embed their text location in their own
    # representation: make it readable instead of an object address.
    def __repr__(self):
        return "TextLoc({}, {})".format(repr(self._line_no), repr(self._col_no))


# Any item.
#
# NOTE: This class doesn't use the `abc.ABCMeta` metaclass, so the
# abstract `size` property below only documents the contract: nothing
# prevents instantiating a subclass which doesn't override it.
class _Item:
    def __init__(self, text_loc: TextLoc):
        self._text_loc = text_loc

    # Source text location.
    @property
    def text_loc(self):
        return self._text_loc

    # Returns the size, in bytes, of this item.
    @property
    @abc.abstractmethod
    def size(self) -> int:
        ...


# A repeatable item.
class _RepableItem(_Item):
    pass


# Single byte.
class _Byte(_RepableItem):
    def __init__(self, val: int, text_loc: TextLoc):
        super().__init__(text_loc)
        self._val = val

    # Byte value.
    @property
    def val(self):
        return self._val

    @property
    def size(self):
        return 1

    def __repr__(self):
        return "_Byte({}, {})".format(hex(self._val), self._text_loc)


# String.
class _Str(_RepableItem):
    def __init__(self, data: bytes, text_loc: TextLoc):
        super().__init__(text_loc)
        self._data = data

    # Encoded bytes.
    @property
    def data(self):
        return self._data

    @property
    def size(self):
        return len(self._data)

    def __repr__(self):
        return "_Str({}, {})".format(repr(self._data), self._text_loc)


# Byte order.
@enum.unique
class ByteOrder(enum.Enum):
    # Big endian.
    BE = "be"

    # Little endian.
    LE = "le"


# Byte order setting.
class _Bo(_Item):
    # `text_loc` is optional so that existing `_Bo(bo)` calls keep
    # working; when omitted, the `text_loc` property returns `None`
    # instead of raising `AttributeError` (the base initializer used to
    # be skipped altogether).
    def __init__(self, bo: ByteOrder, text_loc: Optional[TextLoc] = None):
        super().__init__(text_loc)  # type: ignore
        self._bo = bo

    # Byte order to set.
    @property
    def bo(self):
        return self._bo

    @property
    def size(self):
        return 0

    def __repr__(self):
        return "_Bo({}, {})".format(repr(self._bo), self._text_loc)


# Label.
class _Label(_Item):
    def __init__(self, name: str, text_loc: TextLoc):
        super().__init__(text_loc)
        self._name = name

    # Label name.
    @property
    def name(self):
        return self._name

    @property
    def size(self):
        return 0

    def __repr__(self):
        return "_Label({}, {})".format(repr(self._name), self._text_loc)


# Offset.
class _Offset(_Item):
    def __init__(self, val: int, text_loc: TextLoc):
        super().__init__(text_loc)
        self._val = val

    # Offset value.
    @property
    def val(self):
        return self._val

    @property
    def size(self):
        return 0

    def __repr__(self):
        return "_Offset({}, {})".format(repr(self._val), self._text_loc)


# Mixin of containing an AST expression and its string.
#
# This isn't a cooperative mixin: users call its __init__() explicitly
# (see `_Var` and `_Val`).
class _ExprMixin:
    def __init__(self, expr_str: str, expr: ast.Expression):
        self._expr_str = expr_str
        self._expr = expr

    # Expression string.
    @property
    def expr_str(self):
        return self._expr_str

    # Expression node to evaluate.
    @property
    def expr(self):
        return self._expr


# Variable.
class _Var(_Item, _ExprMixin):
    def __init__(
        self, name: str, expr_str: str, expr: ast.Expression, text_loc: TextLoc
    ):
        super().__init__(text_loc)
        _ExprMixin.__init__(self, expr_str, expr)
        self._name = name

    # Name.
    @property
    def name(self):
        return self._name

    @property
    def size(self):
        return 0

    def __repr__(self):
        return "_Var({}, {}, {}, {})".format(
            repr(self._name), repr(self._expr_str), repr(self._expr), self._text_loc
        )


# Value, possibly needing more than one byte.
class _Val(_RepableItem, _ExprMixin):
    # `len` is the encoding length in bits (the parameter name shadows
    # the built-in within this method only; it's kept for existing
    # callers).
    def __init__(
        self, expr_str: str, expr: ast.Expression, len: int, text_loc: TextLoc
    ):
        super().__init__(text_loc)
        _ExprMixin.__init__(self, expr_str, expr)
        self._len = len

    # Length (bits).
    @property
    def len(self):
        return self._len

    # Size (bytes): the bit length divided by eight, rounded down.
    @property
    def size(self):
        return self._len // 8

    def __repr__(self):
        return "_Val({}, {}, {}, {})".format(
            repr(self._expr_str), repr(self._expr), repr(self._len), self._text_loc
        )


# Expression item type.
_ExprItemT = Union[_Val, _Var]


# Group of items.
class _Group(_RepableItem):
    # Builds a group containing the items `items`.
    #
    # The size of the group is computed once, here, as the sum of the
    # sizes of its items.
    def __init__(self, items: List[_Item], text_loc: TextLoc):
        super().__init__(text_loc)
        self._items = items

        total_size = 0

        for contained_item in items:
            total_size += contained_item.size

        self._size = total_size

    # Contained items.
    @property
    def items(self):
        return self._items

    # Total size (bytes) of the contained items.
    @property
    def size(self):
        return self._size

    def __repr__(self):
        return "_Group({!r}, {})".format(self._items, self._text_loc)


# Repetition item.
class _Rep(_Item):
    # Builds a repetition of the item `item`, `mul` times.
    def __init__(self, item: _RepableItem, mul: int, text_loc: TextLoc):
        super().__init__(text_loc)
        self._item = item
        self._mul = mul

    # Item to repeat.
    @property
    def item(self):
        return self._item

    # Repetition multiplier.
    @property
    def mul(self):
        return self._mul

    # Size (bytes) of the repeated item times the multiplier.
    @property
    def size(self):
        single_size = self._item.size
        return single_size * self._mul

    def __repr__(self):
        return "_Rep({!r}, {!r}, {})".format(self._item, self._mul, self._text_loc)


# A parsing error containing a message and a text location.
class ParseError(RuntimeError):
    # Creates and returns a parsing error having the message `msg` and
    # the source text location `text_loc`.
    #
    # This is the only way to build an instance: __init__() always
    # raises.
    @classmethod
    def _create(cls, msg: str, text_loc: TextLoc):
        error = cls.__new__(cls)
        error._init(msg, text_loc)
        return error

    def __init__(self, *args, **kwargs):  # type: ignore
        raise NotImplementedError

    def _init(self, msg: str, text_loc: TextLoc):
        super().__init__(msg)
        self._text_loc = text_loc

    # Source text location.
    @property
    def text_loc(self):
        return self._text_loc


# Raises a parsing error, forwarding the parameters to the constructor.
def _raise_error(msg: str, text_loc: TextLoc) -> NoReturn:
    error = ParseError._create(msg, text_loc)  # pyright: ignore[reportPrivateUsage]
    raise error


# Variable (and label) dictionary type.
VarsT = Dict[str, int]


# Python name pattern.
_py_name_pat = re.compile(r"[a-zA-Z_][a-zA-Z0-9_]*")


# Normand parser.
#
# The constructor accepts a Normand input. After building, use the `res`
# property to get the resulting main group.
class _Parser:
    # Builds a parser to parse the Normand input `normand`, parsing
    # immediately.
    #
    # Only the names (keys) of `variables` and `labels` are used: they
    # seed the sets of known variable and label names used to detect
    # name clashes.
    #
    # Raises `ParseError` on any parsing error.
    def __init__(self, normand: str, variables: VarsT, labels: VarsT):
        self._normand = normand
        self._at = 0
        self._line_no = 1
        self._col_no = 1
        self._label_names = set(labels.keys())
        self._var_names = set(variables.keys())
        self._parse()

    # Result (main group).
    @property
    def res(self):
        return self._res

    # Current text location.
    @property
    def _text_loc(self):
        return TextLoc._create(  # pyright: ignore[reportPrivateUsage]
            self._line_no, self._col_no
        )

    # Returns `True` if this parser is done parsing.
    def _is_done(self):
        return self._at == len(self._normand)

    # Returns `True` if this parser isn't done parsing.
    def _isnt_done(self):
        return not self._is_done()

    # Raises a parse error, creating it using the message `msg` and the
    # current text location.
    def _raise_error(self, msg: str) -> NoReturn:
        _raise_error(msg, self._text_loc)

    # Tries to make the pattern `pat` match the current substring,
    # returning the match object and updating `self._at`,
    # `self._line_no`, and `self._col_no` on success.
    #
    # Returns `None` without changing the state if there's no match.
    def _try_parse_pat(self, pat: Pattern[str]):
        m = pat.match(self._normand, self._at)

        if m is None:
            return

        # Skip matched string
        self._at += len(m.group(0))

        # Update line number
        self._line_no += m.group(0).count("\n")

        # Update column number: 1-based distance from the last newline
        # located before the current position. str.rfind() returns -1
        # when there's none, which yields `self._at + 1` (first line).
        # This also covers a newline at index 0, for which the column
        # used to be one too high.
        self._col_no = self._at - self._normand.rfind("\n", 0, self._at)

        # Return match object
        return m

    # Expects the pattern `pat` to match the current substring,
    # returning the match object and updating `self._at`,
    # `self._line_no`, and `self._col_no` on success, or raising a parse
    # error with the message `error_msg` on error.
    def _expect_pat(self, pat: Pattern[str], error_msg: str):
        # Match
        m = self._try_parse_pat(pat)

        if m is None:
            # No match: error
            self._raise_error(error_msg)

        # Return match object
        return m

    # Returns the value of the positive constant integer string `s`, as
    # matched by `_pos_const_int_pat` (hexadecimal with a `0x`/`0X`
    # prefix, decimal otherwise).
    #
    # int(s, 0) isn't used because it raises `ValueError` for a decimal
    # constant having leading zeros (for example `010`), which would
    # escape as something else than a `ParseError`.
    @staticmethod
    def _int_from_pos_const_str(s: str):
        if s[:2] in ("0x", "0X"):
            return int(s, 16)

        return int(s, 10)

    # Pattern for _skip_ws_and_comments()
    _ws_or_syms_or_comments_pat = re.compile(
        r"(?:[\s!@/\\?&:;.,+[\]_=|-]|#[^#]*?(?:\n|#))*"
    )

    # Skips as many whitespaces, insignificant symbol characters, and
    # comments as possible.
    def _skip_ws_and_comments(self):
        self._try_parse_pat(self._ws_or_syms_or_comments_pat)

    # Pattern for _try_parse_hex_byte()
    _nibble_pat = re.compile(r"[A-Fa-f0-9]")

    # Tries to parse a hexadecimal byte, returning a byte item on
    # success.
    def _try_parse_hex_byte(self):
        # Match initial nibble
        m_high = self._try_parse_pat(self._nibble_pat)

        if m_high is None:
            # No match
            return

        # Expect another nibble
        self._skip_ws_and_comments()
        m_low = self._expect_pat(
            self._nibble_pat, "Expecting another hexadecimal nibble"
        )

        # Return item
        return _Byte(int(m_high.group(0) + m_low.group(0), 16), self._text_loc)

    # Patterns for _try_parse_bin_byte()
    _bin_byte_bit_pat = re.compile(r"[01]")
    _bin_byte_prefix_pat = re.compile(r"%")

    # Tries to parse a binary byte, returning a byte item on success.
    def _try_parse_bin_byte(self):
        # Match prefix
        if self._try_parse_pat(self._bin_byte_prefix_pat) is None:
            # No match
            return

        # Expect eight bits
        bits = []  # type: List[str]

        for _ in range(8):
            self._skip_ws_and_comments()
            m = self._expect_pat(self._bin_byte_bit_pat, "Expecting a bit (`0` or `1`)")
            bits.append(m.group(0))

        # Return item
        return _Byte(int("".join(bits), 2), self._text_loc)

    # Patterns for _try_parse_dec_byte()
    #
    # The `neg` and `val` group names are the ones which
    # _try_parse_dec_byte() reads.
    _dec_byte_prefix_pat = re.compile(r"\$\s*")
    _dec_byte_val_pat = re.compile(r"(?P<neg>-?)(?P<val>\d+)")

    # Tries to parse a decimal byte, returning a byte item on success.
    def _try_parse_dec_byte(self):
        # Match prefix
        if self._try_parse_pat(self._dec_byte_prefix_pat) is None:
            # No match
            return

        # Expect the value
        m = self._expect_pat(self._dec_byte_val_pat, "Expecting a decimal constant")

        # Compute value
        val = int(m.group("val")) * (-1 if m.group("neg") == "-" else 1)

        # Validate
        if val < -128 or val > 255:
            self._raise_error("Invalid decimal byte value {}".format(val))

        # Two's complement
        val = val % 256

        # Return item
        return _Byte(val, self._text_loc)

    # Tries to parse a byte, returning a byte item on success.
    def _try_parse_byte(self):
        # Hexadecimal
        item = self._try_parse_hex_byte()

        if item is not None:
            return item

        # Binary
        item = self._try_parse_bin_byte()

        if item is not None:
            return item

        # Decimal
        item = self._try_parse_dec_byte()

        if item is not None:
            return item

    # Patterns for _try_parse_str()
    #
    # The `len` and `bo` group names are the ones which _try_parse_str()
    # reads to select the encoding.
    _str_prefix_pat = re.compile(r'(?:u(?P<len>16|32)(?P<bo>be|le))?\s*"')
    _str_suffix_pat = re.compile(r'"')
    _str_str_pat = re.compile(r'(?:(?:\\.)|[^"])*')
    _str_escape_seq_pat = re.compile(r"\\(.)")

    # Strings corresponding to escape sequence characters
    _str_escape_seq_strs = {
        "0": "\0",
        "a": "\a",
        "b": "\b",
        "e": "\x1b",
        "f": "\f",
        "n": "\n",
        "r": "\r",
        "t": "\t",
        "v": "\v",
        "\\": "\\",
        '"': '"',
    }

    # Tries to parse a string, returning a string item on success.
    def _try_parse_str(self):
        # Match prefix
        m = self._try_parse_pat(self._str_prefix_pat)

        if m is None:
            # No match
            return

        # Get encoding
        encoding = "utf8"

        if m.group("len") is not None:
            encoding = "utf_{}_{}".format(m.group("len"), m.group("bo"))

        # Actual string
        m = self._expect_pat(self._str_str_pat, "Expecting a literal string")

        # Expect end of string
        self._expect_pat(self._str_suffix_pat, 'Expecting `"` (end of literal string)')

        # Replace escape sequences in a single left-to-right pass so
        # that the character following an escaped backslash is never
        # taken for an escape sequence character (`\\n` is a backslash
        # followed with `n`, not a newline). An unknown escape sequence
        # is kept as is.
        val = self._str_escape_seq_pat.sub(
            lambda esc_m: self._str_escape_seq_strs.get(
                esc_m.group(1), esc_m.group(0)
            ),
            m.group(0),
        )

        # Encode
        data = val.encode(encoding)

        # Return item
        return _Str(data, self._text_loc)

    # Patterns for _try_parse_group()
    _group_prefix_pat = re.compile(r"\(")
    _group_suffix_pat = re.compile(r"\)")

    # Tries to parse a group, returning a group item on success.
    def _try_parse_group(self):
        # Match prefix
        if self._try_parse_pat(self._group_prefix_pat) is None:
            # No match
            return

        # Parse items
        items = self._parse_items()

        # Expect end of group
        self._skip_ws_and_comments()
        self._expect_pat(
            self._group_suffix_pat, "Expecting an item or `)` (end of group)"
        )

        # Return item
        return _Group(items, self._text_loc)

    # Returns a stripped expression string and an AST expression node
    # from the expression string `expr_str` at text location `text_loc`.
    #
    # Raises a parse error at `text_loc` if `expr_str` isn't a valid
    # Python expression.
    def _ast_expr_from_str(self, expr_str: str, text_loc: TextLoc):
        # Create an expression node from the expression string
        expr_str = expr_str.strip().replace("\n", " ")

        try:
            expr = ast.parse(expr_str, mode="eval")
        except SyntaxError:
            _raise_error(
                "Invalid expression `{}`: invalid syntax".format(expr_str),
                text_loc,
            )

        return expr_str, expr

    # Patterns for _try_parse_val_and_len()
    _val_expr_pat = re.compile(r"([^}:]+):")
    _val_len_pat = re.compile(r"\s*(8|16|24|32|40|48|56|64)")

    # Tries to parse a value and length, returning a value item on
    # success.
    def _try_parse_val_and_len(self):
        begin_text_loc = self._text_loc

        # Match
        m_expr = self._try_parse_pat(self._val_expr_pat)

        if m_expr is None:
            # No match
            return

        # Expect a length
        m_len = self._expect_pat(
            self._val_len_pat, "Expecting a length (multiple of eight bits)"
        )

        # Create an expression node from the expression string
        expr_str, expr = self._ast_expr_from_str(m_expr.group(1), begin_text_loc)

        # Return item
        return _Val(
            expr_str,
            expr,
            int(m_len.group(1)),
            self._text_loc,
        )

    # Pattern for _try_parse_var()
    #
    # The `name` and `expr` group names are the ones which
    # _try_parse_var() reads.
    _var_pat = re.compile(
        r"(?P<name>{})\s*=\s*(?P<expr>[^}}]+)".format(_py_name_pat.pattern)
    )

    # Tries to parse a variable, returning a variable item on success.
    def _try_parse_var(self):
        begin_text_loc = self._text_loc

        # Match
        m = self._try_parse_pat(self._var_pat)

        if m is None:
            # No match
            return

        # Validate name
        name = m.group("name")

        if name == _icitte_name:
            self._raise_error("`{}` is a reserved variable name".format(_icitte_name))

        if name in self._label_names:
            self._raise_error("Existing label named `{}`".format(name))

        # Add to known variable names
        self._var_names.add(name)

        # Create an expression node from the expression string
        expr_str, expr = self._ast_expr_from_str(m.group("expr"), begin_text_loc)

        # Return item
        return _Var(
            name,
            expr_str,
            expr,
            self._text_loc,
        )

    # Pattern for _try_parse_bo_name()
    _bo_pat = re.compile(r"[bl]e")

    # Tries to parse a byte order name, returning a byte order item on
    # success.
    def _try_parse_bo_name(self):
        # Match
        m = self._try_parse_pat(self._bo_pat)

        if m is None:
            # No match
            return

        # Return corresponding item
        if m.group(0) == "be":
            return _Bo(ByteOrder.BE)
        else:
            assert m.group(0) == "le"
            return _Bo(ByteOrder.LE)

    # Patterns for _try_parse_val_or_var_or_bo()
    _val_var_bo_prefix_pat = re.compile(r"\{\s*")
    _val_var_bo_suffix_pat = re.compile(r"\s*}")

    # Tries to parse a value, a variable, or a byte order, returning an
    # item on success.
    def _try_parse_val_or_var_or_bo(self):
        # Match prefix
        if self._try_parse_pat(self._val_var_bo_prefix_pat) is None:
            # No match
            return

        # Variable item?
        item = self._try_parse_var()

        if item is None:
            # Value item?
            item = self._try_parse_val_and_len()

            if item is None:
                # Byte order item?
                item = self._try_parse_bo_name()

                if item is None:
                    # At this point it's invalid
                    self._raise_error("Expecting a value, a variable, or a byte order")

        # Expect suffix
        self._expect_pat(self._val_var_bo_suffix_pat, "Expecting `}`")
        return item

    # Pattern for _try_parse_offset_val() and _try_parse_rep()
    _pos_const_int_pat = re.compile(r"0[Xx][A-Fa-f0-9]+|\d+")

    # Tries to parse an offset value (after the initial `<`), returning
    # an offset item on success.
    def _try_parse_offset_val(self):
        # Match
        m = self._try_parse_pat(self._pos_const_int_pat)

        if m is None:
            # No match
            return

        # Return item
        return _Offset(self._int_from_pos_const_str(m.group(0)), self._text_loc)

    # Tries to parse a label name (after the initial `<`), returning a
    # label item on success.
    def _try_parse_label_name(self):
        # Match
        m = self._try_parse_pat(_py_name_pat)

        if m is None:
            # No match
            return

        # Validate
        name = m.group(0)

        if name == _icitte_name:
            self._raise_error("`{}` is a reserved label name".format(_icitte_name))

        if name in self._label_names:
            self._raise_error("Duplicate label name `{}`".format(name))

        if name in self._var_names:
            self._raise_error("Existing variable named `{}`".format(name))

        # Add to known label names
        self._label_names.add(name)

        # Return item
        return _Label(name, self._text_loc)

    # Patterns for _try_parse_label_or_offset()
    _label_offset_prefix_pat = re.compile(r"<\s*")
    _label_offset_suffix_pat = re.compile(r"\s*>")

    # Tries to parse a label or an offset, returning an item on success.
    def _try_parse_label_or_offset(self):
        # Match prefix
        if self._try_parse_pat(self._label_offset_prefix_pat) is None:
            # No match
            return

        # Offset item?
        item = self._try_parse_offset_val()

        if item is None:
            # Label item?
            item = self._try_parse_label_name()

            if item is None:
                # At this point it's invalid
                self._raise_error("Expecting a label name or an offset value")

        # Expect suffix
        self._expect_pat(self._label_offset_suffix_pat, "Expecting `>`")
        return item

    # Tries to parse a base item (anything except a repetition),
    # returning it on success.
    def _try_parse_base_item(self):
        # Byte item?
        item = self._try_parse_byte()

        if item is not None:
            return item

        # String item?
        item = self._try_parse_str()

        if item is not None:
            return item

        # Value, variable, or byte order item?
        item = self._try_parse_val_or_var_or_bo()

        if item is not None:
            return item

        # Label or offset item?
        item = self._try_parse_label_or_offset()

        if item is not None:
            return item

        # Group item?
        item = self._try_parse_group()

        if item is not None:
            return item

    # Pattern for _try_parse_rep()
    _rep_prefix_pat = re.compile(r"\*\s*")

    # Tries to parse a repetition, returning the multiplier on success,
    # or 1 otherwise.
    def _try_parse_rep(self):
        self._skip_ws_and_comments()

        # Match prefix
        if self._try_parse_pat(self._rep_prefix_pat) is None:
            # No match
            return 1

        # Expect and return a constant multiplier
        self._skip_ws_and_comments()
        m = self._expect_pat(
            self._pos_const_int_pat, "Expecting a positive integral multiplier"
        )
        return self._int_from_pos_const_str(m.group(0))

    # Tries to parse a repeatable item followed or not by a repetition,
    # returning an item on success.
    def _try_parse_item(self):
        self._skip_ws_and_comments()

        # Parse a base item
        item = self._try_parse_base_item()

        if item is None:
            # No item
            return

        # Parse repetition if the base item is repeatable
        if isinstance(item, _RepableItem):
            rep = self._try_parse_rep()

            if rep != 1:
                # Convert to repetition item.
                #
                # A zero multiplier becomes an empty repetition (size
                # zero, nothing generated). Returning "no item" here
                # would make _parse_items() stop as if it had reached an
                # unknown character, dropping or rejecting whatever
                # follows.
                item = _Rep(item, rep, self._text_loc)

        return item

    # Parses and returns items, skipping whitespaces, insignificant
    # symbols, and comments when allowed, and stopping at the first
    # unknown character.
    def _parse_items(self) -> List[_Item]:
        items = []  # type: List[_Item]

        while self._isnt_done():
            # Try to parse item
            item = self._try_parse_item()

            if item is not None:
                # Append new item
                items.append(item)
                continue

            # Unknown at this point
            break

        return items

    # Parses the whole Normand input, setting `self._res` to the main
    # group item on success.
    #
    # Raises a parse error if anything remains after the last item.
    def _parse(self):
        if len(self._normand.strip()) == 0:
            # Special case to make sure there's something to consume
            self._res = _Group([], self._text_loc)
            return

        # Parse first level items
        items = self._parse_items()

        # Make sure there's nothing left
        self._skip_ws_and_comments()

        if self._isnt_done():
            self._raise_error(
                "Unexpected character `{}`".format(self._normand[self._at])
            )

        # Set main group item
        self._res = _Group(items, self._text_loc)


# The return type of parse().
class ParseResult:
    # Creates and returns a parse result from the generated data
    # `data`, the updated variables `variables`, main group labels
    # `labels`, offset `offset`, and byte order `bo`, bypassing
    # `__init__()`.
    @classmethod
    def _create(
        cls,
        data: bytearray,
        variables: VarsT,
        labels: VarsT,
        offset: int,
        bo: Optional[ByteOrder],
    ):
        res = cls.__new__(cls)
        res._init(data, variables, labels, offset, bo)
        return res

    # Not directly constructible: use `_create()`.
    def __init__(self, *args, **kwargs):  # type: ignore
        raise NotImplementedError

    # Real initializer, called by `_create()`.
    def _init(
        self,
        data: bytearray,
        variables: VarsT,
        labels: VarsT,
        offset: int,
        bo: Optional[ByteOrder],
    ):
        self._data, self._offset, self._bo = data, offset, bo
        self._vars, self._labels = variables, labels

    # Generated data.
    @property
    def data(self):
        return self._data

    # Dictionary of updated variable names to their last computed value.
    @property
    def variables(self):
        return self._vars

    # Dictionary of updated main group label names to their computed
    # value.
    @property
    def labels(self):
        return self._labels

    # Updated offset.
    @property
    def offset(self):
        return self._offset

    # Updated byte order.
    @property
    def byte_order(self):
        return self._bo


# Raises a parse error for the item `item`, creating it using the
# message `msg` and the text location of `item`.
def _raise_error_for_item(msg: str, item: _Item) -> NoReturn:
    _raise_error(msg, item.text_loc)


# The `ICITTE` reserved name.
_icitte_name = "ICITTE"


# Value expression validator.
+class _ExprValidator(ast.NodeVisitor): + def __init__(self, item: _ExprItemT, syms: VarsT): + self._item = item + self._syms = syms + self._parent_is_call = False + + def generic_visit(self, node: ast.AST): + if type(node) is ast.Call: + self._parent_is_call = True + elif type(node) is ast.Name and not self._parent_is_call: + # Make sure the name refers to a known label name + if node.id != _icitte_name and node.id not in self._syms: + _raise_error( + "Unknown variable/label name `{}` in expression `{}`".format( + node.id, self._item.expr_str + ), + self._item.text_loc, + ) + + # TODO: Restrict the set of allowed node types + + super().generic_visit(node) + self._parent_is_call = False + + +# Keeper of labels for a given group instance. +# +# A group instance is one iteration of a given group. +class _GroupInstanceLabels: + def __init__(self): + self._instance_labels = {} # type: Dict[_Group, Dict[int, VarsT]] + + # Assigns the labels `labels` to a new instance of `group`. + def add(self, group: _Group, labels: VarsT): + if group not in self._instance_labels: + self._instance_labels[group] = {} + + spec_instance_labels = self._instance_labels[group] + spec_instance_labels[len(spec_instance_labels)] = labels.copy() + + # Returns the labels (not a copy) of the instance `instance_index` + # of the group `group`. + def labels(self, group: _Group, instance_index: int): + return self._instance_labels[group][instance_index] + + +# Generator of data and labels from a group item. +# +# Generation happens in memory at construction time. After building, use +# the `data`, `variables`, `labels`, `offset`, and `bo` properties to +# get the resulting context. 
class _Gen:
    # Builds a generator for the main group `group`, generating
    # immediately, starting with copies of the initial variables
    # `variables` and labels `labels`, the initial offset `offset`, and
    # the initial byte order `bo`.
    def __init__(
        self,
        group: _Group,
        variables: VarsT,
        labels: VarsT,
        offset: int,
        bo: Optional[ByteOrder],
    ):
        # Labels must be resolved before generating because an
        # expression may refer to a label located after it
        self._group_instance_labels = _GroupInstanceLabels()
        self._resolve_labels(group, offset, labels.copy())

        # Initial generation context
        self._vars = variables.copy()
        self._offset = offset
        self._bo = bo
        self._main_group = group

        # Generate
        self._gen()

    # Generated bytes.
    @property
    def data(self):
        return self._data

    # Updated variables.
    @property
    def variables(self):
        return self._vars

    # Updated main group labels.
    @property
    def labels(self):
        return self._group_instance_labels.labels(self._main_group, 0)

    # Updated offset.
    @property
    def offset(self):
        return self._offset

    # Updated byte order.
    @property
    def bo(self):
        return self._bo

    # Fills `self._group_instance_labels` with the labels for each group
    # instance in `item`, starting at current offset `offset` with the
    # current labels `labels`.
    #
    # Returns the new current offset.
    def _resolve_labels(self, item: _Item, offset: int, labels: VarsT) -> int:
        if type(item) is _Offset:
            # An offset item sets the current offset
            return item.val

        if type(item) is _Rep:
            # One pass per iteration of the repeated item
            for _ in range(item.mul):
                offset = self._resolve_labels(item.item, offset, labels)

            return offset

        if type(item) is not _Group:
            # Any other item only advances the current offset
            return offset + item.size

        # First pass: compute immediate labels of this group instance
        inst_labels = labels.copy()
        inst_offset = offset

        for subitem in item.items:
            if type(subitem) is _Offset:
                inst_offset = subitem.val
            elif type(subitem) is _Label:
                assert subitem.name not in inst_labels
                inst_labels[subitem.name] = inst_offset
            else:
                inst_offset += subitem.size

        # Add to group instance labels
        self._group_instance_labels.add(item, inst_labels)

        # Second pass: handle each item
        for subitem in item.items:
            offset = self._resolve_labels(subitem, offset, inst_labels)

        return offset

    # Appends the value of the byte item `item` to the generated data.
    def _handle_byte_item(self, item: _Byte):
        self._data.append(item.val)
        self._offset += item.size

    # Appends the data of the string item `item` to the generated data.
    def _handle_str_item(self, item: _Str):
        self._data += item.data
        self._offset += item.size

    # Sets the current byte order to the one of `item`.
    def _handle_bo_item(self, item: _Bo):
        self._bo = item.bo

    # Evaluates the expression of `item` and returns its integral
    # result.
    #
    # The available names are the labels of the current group instance,
    # `ICITTE` (current offset), and the current variables, the latter
    # ones having precedence.
    #
    # Raises a parse error if a name is unknown, if the evaluation
    # fails, or if the type of the result isn't exactly `int`.
    def _eval_expr(self, item: _ExprItemT):
        # Get the labels of the current group instance as the initial
        # symbols (copied because we're adding stuff).
        assert self._cur_group is not None
        cur_instance_index = self._group_instance_indexes[self._cur_group]
        syms = self._group_instance_labels.labels(
            self._cur_group, cur_instance_index
        ).copy()

        # Set the `ICITTE` name to the current offset (before encoding)
        syms[_icitte_name] = self._offset

        # Add the current variables
        syms.update(self._vars)

        # Validate the node and its children
        _ExprValidator(item, syms).visit(item.expr)

        # Compile and evaluate expression node.
        #
        # NOTE(review): eval() receives `None` as its global namespace,
        # so the expression isn't limited to `syms`; confirm that
        # Normand inputs are trusted.
        try:
            code = compile(item.expr, "", "eval")
            val = eval(code, None, syms)
        except Exception as exc:
            _raise_error_for_item(
                "Failed to evaluate expression `{}`: {}".format(item.expr_str, exc),
                item,
            )

        # Validate result
        if type(val) is not int:
            _raise_error_for_item(
                "Invalid expression `{}`: unexpected result type `{}`".format(
                    item.expr_str, type(val).__name__
                ),
                item,
            )

        return val

    # Sets the variable named by `item` to the result of its expression.
    def _handle_var_item(self, item: _Var):
        # Update variable
        self._vars[item.name] = self._eval_expr(item)

    # Evaluates the expression of the value item `item` and appends the
    # result, encoded on `item.len` bits with the current byte order, to
    # the generated data.
    #
    # Raises a parse error if the result is out of range or if there's
    # no current byte order for a length greater than eight bits.
    def _handle_val_item(self, item: _Val):
        # Compute value
        val = self._eval_expr(item)

        # Validate range (accepts both signed and unsigned values)
        min_val = -(2 ** (item.len - 1))
        max_val = 2**item.len - 1

        if not (min_val <= val <= max_val):
            _raise_error_for_item(
                "Value {:,} is outside the {}-bit range when evaluating expression `{}` at byte offset {:,}".format(
                    val, item.len, item.expr_str, self._offset
                ),
                item,
            )

        # A byte order is required for anything larger than a byte
        if self._bo is None and item.len > 8:
            _raise_error_for_item(
                "Current byte order isn't defined at first value (`{}`) to encode on more than 8 bits".format(
                    item.expr_str
                ),
                item,
            )

        # Encode result on 64 bits (to extend the sign bit whatever the
        # value of `item.len`); no byte order means big endian here.
        is_be = self._bo in (None, ByteOrder.BE)
        fmt = (">" if is_be else "<") + ("Q" if val >= 0 else "q")
        data = struct.pack(fmt, val)

        # Keep only the requested length
        len_bytes = item.len // 8

        if is_be:
            # Big endian: keep last bytes
            data = data[-len_bytes:]
        else:
            # Little endian: keep first bytes
            assert self._bo == ByteOrder.LE
            data = data[:len_bytes]

        # Append to current bytes and update offset
        self._data += data
        self._offset += len(data)

    # Handles each item of a new instance of the group item `item`.
    def _handle_group_item(self, item: _Group):
        # Update the instance index of `item` (first instance is 0)
        self._group_instance_indexes[item] = (
            self._group_instance_indexes.get(item, -1) + 1
        )

        # Changed current group
        old_cur_group = self._cur_group
        self._cur_group = item

        # Handle each item
        for subitem in item.items:
            self._handle_item(subitem)

        # Restore current group
        self._cur_group = old_cur_group

    # Handles the repeated item of `item` as many times as its
    # multiplier.
    def _handle_rep_item(self, item: _Rep):
        for _ in range(item.mul):
            self._handle_item(item.item)

    # Sets the current offset to the value of `item`.
    def _handle_offset_item(self, item: _Offset):
        self._offset = item.val

    # Dispatches `item` to the handler registered for its exact type,
    # doing nothing if there's none.
    def _handle_item(self, item: _Item):
        handler = self._item_handlers.get(type(item))

        if handler is not None:
            handler(item)

    # Generates the data of the main group, updating the context.
    def _gen(self):
        # Initial state
        self._data = bytearray()
        self._group_instance_indexes = {}  # type: Dict[_Group, int]
        self._cur_group = None

        # Item handlers
        self._item_handlers = {
            _Byte: self._handle_byte_item,
            _Str: self._handle_str_item,
            _Bo: self._handle_bo_item,
            _Val: self._handle_val_item,
            _Var: self._handle_var_item,
            _Group: self._handle_group_item,
            _Rep: self._handle_rep_item,
            _Offset: self._handle_offset_item,
        }  # type: Dict[type, Callable[[Any], None]]

        # Handle the group item
        self._handle_item(self._main_group)


# Returns a `ParseResult` instance containing the bytes encoded by the
# input string `normand`.
#
# `init_variables` is a dictionary of initial variable names (valid
# Python names) to integral values. A variable name must not be the
# reserved name `ICITTE`.
#
# `init_labels` is a dictionary of initial label names (valid Python
# names) to integral values. A label name must not be the reserved name
# `ICITTE`.
#
# `init_offset` is the initial offset.
#
# `init_byte_order` is the initial byte order.
#
# Raises `ParseError` on any parsing error.
def parse(
    normand: str,
    init_variables: Optional[VarsT] = None,
    init_labels: Optional[VarsT] = None,
    init_offset: int = 0,
    init_byte_order: Optional[ByteOrder] = None,
):
    # No initial variables/labels means empty dictionaries
    variables = {} if init_variables is None else init_variables
    labels = {} if init_labels is None else init_labels

    # Parse, and then generate from the resulting main group
    main_group = _Parser(normand, variables, labels).res
    gen = _Gen(main_group, variables, labels, init_offset, init_byte_order)
    return ParseResult._create(  # pyright: ignore[reportPrivateUsage]
        gen.data, gen.variables, gen.labels, gen.offset, gen.bo
    )


# Parses the command-line arguments.
def _parse_cli_args():
    import argparse

    # Build parser
    ap = argparse.ArgumentParser()
    ap.add_argument(
        "--offset",
        metavar="OFFSET",
        action="store",
        type=int,
        default=0,
        help="initial offset (positive)",
    )
    ap.add_argument(
        "-b",
        "--byte-order",
        metavar="BO",
        choices=["be", "le"],
        type=str,
        help="initial byte order (`be` or `le`)",
    )
    ap.add_argument(
        "--var",
        metavar="NAME=VAL",
        action="append",
        help="add an initial variable (may be repeated)",
    )
    ap.add_argument(
        "-l",
        "--label",
        metavar="NAME=VAL",
        action="append",
        help="add an initial label (may be repeated)",
    )
    ap.add_argument(
        "--version", action="version", version="Normand {}".format(__version__)
    )
    ap.add_argument(
        "path",
        metavar="PATH",
        action="store",
        nargs="?",
        help="input path (none means standard input)",
    )

    # Parse
    return ap.parse_args()


# Raises a command-line error with the message `msg`.
def _raise_cli_error(msg: str) -> NoReturn:
    raise RuntimeError("Command-line error: {}".format(msg))


# Returns a dictionary of string to integers from the list of strings
# `args` containing `NAME=VAL` entries, or an empty dictionary if `args`
# is `None`.
#
# `NAME` must be a valid Python name and `VAL` a sequence of decimal
# digits. If a name is repeated, then its last value wins.
#
# Raises a command-line error if an entry is invalid.
def _dict_from_arg(args: Optional[List[str]]):
    d = {}  # type: Dict[str, int]

    if args is None:
        return d

    for arg in args:
        m = re.match(r"({})=(\d+)$".format(_py_name_pat.pattern), arg)

        if m is None:
            _raise_cli_error("Invalid assignment {}".format(arg))

        # Keep the entry: without this, every `--var` and `--label`
        # option would be validated and then silently ignored
        d[m.group(1)] = int(m.group(2))

    return d


# CLI entry point without exception handling.
+def _try_run_cli(): + import os.path + + # Parse arguments + args = _parse_cli_args() + + # Read input + if args.path is None: + normand = sys.stdin.read() + else: + with open(args.path) as f: + normand = f.read() + + # Variables and labels + variables = _dict_from_arg(args.var) + labels = _dict_from_arg(args.label) + + # Validate offset + if args.offset < 0: + _raise_cli_error("Invalid negative offset {}") + + # Validate and set byte order + bo = None # type: Optional[ByteOrder] + + if args.byte_order is not None: + if args.byte_order == "be": + bo = ByteOrder.BE + else: + assert args.byte_order == "le" + bo = ByteOrder.LE + + # Parse + try: + res = parse(normand, variables, labels, args.offset, bo) + except ParseError as exc: + prefix = "" + + if args.path is not None: + prefix = "{}:".format(os.path.abspath(args.path)) + + _fail( + "{}{}:{} - {}".format( + prefix, exc.text_loc.line_no, exc.text_loc.col_no, str(exc) + ) + ) + + # Print + sys.stdout.buffer.write(res.data) + + +# Prints the exception message `msg` and exits with status 1. +def _fail(msg: str) -> NoReturn: + if not msg.endswith("."): + msg += "." + + print(msg, file=sys.stderr) + sys.exit(1) + + +# CLI entry point. +def _run_cli(): + try: + _try_run_cli() + except Exception as exc: + _fail(str(exc)) + + +if __name__ == "__main__": + _run_cli() diff --git a/poetry.lock b/poetry.lock new file mode 100644 index 0000000..da6e1e4 --- /dev/null +++ b/poetry.lock @@ -0,0 +1,7 @@ +# This file is automatically @generated by Poetry 1.5.0 and should not be changed by hand. 
+package = [] + +[metadata] +lock-version = "2.0" +python-versions = "^3.4" +content-hash = "82b4465263ab9f51f0abde5f9fadb41167e70f3b305911d53fbf8018f78740d9" diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..ae8c1b3 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,66 @@ +# The MIT License (MIT) +# +# Copyright (c) 2023 Philippe Proulx +# +# Permission is hereby granted, free of charge, to any person obtaining +# a copy of this software and associated documentation files (the +# "Software"), to deal in the Software without restriction, including +# without limitation the rights to use, copy, modify, merge, publish, +# distribute, sublicense, and/or sell copies of the Software, and to +# permit persons to whom the Software is furnished to do so, subject to +# the following conditions: +# +# The above copyright notice and this permission notice shall be +# included in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +# IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +# CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ +[tool.poetry] +name = 'normand' +version = '0.1.0' +description = 'Text-to-binary processor with its own language' +license = 'MIT' +authors = ['Philippe Proulx '] +repository = 'https://github.com/efficios/normand' +keywords = [ + 'normand', + 'binary', +] +classifiers = [ + 'Development Status :: 3 - Alpha', + 'Environment :: Console', + 'Intended Audience :: Developers', + 'License :: OSI Approved :: MIT License', + 'Natural Language :: English', + 'Operating System :: OS Independent', + 'Topic :: Software Development :: Compilers', +] +packages = [{include = 'normand'}] + +[tool.poetry.dependencies] +python = '^3.4' + +[tool.poetry.scripts] +normand = 'normand.normand:_run_cli' + +[tool.poetry.urls] +'Bug tracker' = 'https://github.com/efficios/normand/issues' +'Code review' = 'https://review.lttng.org/admin/repos/normand' + +[tool.isort] +profile = 'black' +length_sort = true + +[tool.pyright] +typeCheckingMode = 'strict' +reportTypeCommentUsage = false + +[build-system] +requires = ['poetry-core'] +build-backend = 'poetry.core.masonry.api' -- 2.34.1