Support full reconstruction of HCL from parse tree (#169)

amplify-education · Oct 7, 2024 · 1dff4d7 · 1dff4d7
1 parent 92f2ee2
commit 1dff4d7
Show file tree

Hide file tree

Showing 7 changed files with 334 additions and 9 deletions.
diff --git a/.gitignore b/.gitignore
@@ -121,3 +121,6 @@ node_modules/
 # Don't commit the generated parser
 lark_parser.py
 .lark_cache.bin
+
+# ASDF tool-versions file
+.tool-versions
diff --git a/hcl2/__init__.py b/hcl2/__init__.py
@@ -5,4 +5,4 @@
 except ImportError:
     __version__ = "unknown"
 
-from .api import load, loads
+from .api import load, loads, parse, parses, transform, writes, AST
diff --git a/hcl2/api.py b/hcl2/api.py
@@ -1,6 +1,7 @@
 """The API that will be exposed to users of this package"""
 from typing import TextIO
 
+from lark.tree import Tree as AST
 from hcl2.parser import hcl2
 from hcl2.transformer import DictTransformer
 
@@ -26,3 +27,43 @@ def loads(text: str, with_meta=False) -> dict:
     # Append a new line as a temporary fix
     tree = hcl2.parse(text + "\n")
     return DictTransformer(with_meta=with_meta).transform(tree)
+
+
+def parse(file: TextIO) -> AST:
+    """Read HCL2 source from an open file and parse it into a syntax tree.
+
+    :param file: Readable text file containing HCL2; it is read in full.
+    :return: The syntax tree that `parses` produces for the file's contents.
+    """
+    source_text = file.read()
+    return parses(source_text)
+
+
+def parses(text: str) -> AST:
+    """Parse a string of HCL2 into a syntax tree.
+
+    The text is handed to the parser exactly as given; unlike `loads`, no
+    trailing newline is appended.
+
+    :param text: HCL2 source text to parse.
+    :return: The tree returned by the reconstructor module's parser.
+    """
+    # The reconstructor module rebuilds the grammar without a cache, which is
+    # slow, so it is only imported once this function is actually called.
+    from hcl2.reconstructor import (  # pylint: disable=import-outside-toplevel
+        hcl2 as uncached_hcl2,
+    )
+
+    syntax_tree = uncached_hcl2.parse(text)
+    return syntax_tree
+
+
+def transform(ast: AST, with_meta=False) -> dict:
+    """Turn an HCL2 syntax tree into a dictionary.
+
+    :param ast: HCL2 syntax tree, as returned by `parse` or `parses`.
+    :param with_meta: Forwarded unchanged to `DictTransformer`.
+    :return: The result of running `DictTransformer` over the tree.
+    """
+    transformer = DictTransformer(with_meta=with_meta)
+    return transformer.transform(ast)
+
+
+def writes(ast: AST) -> str:
+    """Render an HCL2 syntax tree back into a string.
+
+    :param ast: HCL2 syntax tree, as returned by `parse` or `parses`.
+    :return: The text produced by the module-level HCL2 reconstructor.
+    """
+    # The reconstructor module rebuilds the grammar without a cache, which is
+    # slow, so it is only imported once this function is actually called.
+    from hcl2.reconstructor import (  # pylint: disable=import-outside-toplevel
+        hcl2_reconstructor,
+    )
+
+    hcl_text = hcl2_reconstructor.reconstruct(ast)
+    return hcl_text
diff --git a/hcl2/hcl2.lark b/hcl2/hcl2.lark
@@ -1,11 +1,13 @@
 start : body
 body : (new_line_or_comment? (attribute | block))* new_line_or_comment?
-attribute : identifier "=" expression
+attribute : identifier EQ expression
 block : identifier (identifier | STRING_LIT)* new_line_or_comment? "{" body "}"
 new_line_and_or_comma: new_line_or_comment | "," | "," new_line_or_comment
-new_line_or_comment: ( /\n/ | /#.*\n/ | /\/\/.*\n/ )+
+// One or more NL_OR_COMMENT tokens. The NL_OR_COMMENT alternatives are, in
+// order: a newline plus any spaces/tabs that follow it, a `#` line comment,
+// a `//` line comment, and a `/* ... */` block comment (matched non-greedily).
+new_line_or_comment: ( NL_OR_COMMENT )+
+NL_OR_COMMENT: /\n[ \t]*/ | /#.*\n/ | /\/\/.*\n/ | /\/\*(.|\n)*?(\*\/)/
 
-identifier : /[a-zA-Z_][a-zA-Z0-9_-]*/ | IN | FOR | IF | FOR_EACH
+identifier : NAME | IN | FOR | IF | FOR_EACH
+NAME : /[a-zA-Z_][a-zA-Z0-9_-]*/
 IF : "if"
 IN : "in"
 FOR : "for"
@@ -18,8 +20,9 @@ conditional : expression "?" new_line_or_comment? expression new_line_or_comment
 ?operation : unary_op | binary_op
 !unary_op : ("-" | "!") expr_term
 binary_op : expression binary_term new_line_or_comment?
-!binary_operator : "==" | "!=" | "<" | ">" | "<=" | ">=" | "-" | "*" | "/" | "%" | "&&" | "||" | "+"
+!binary_operator : BINARY_OP
 binary_term : binary_operator new_line_or_comment? expression
+BINARY_OP : "==" | "!=" | "<" | ">" | "<=" | ">=" | "-" | "*" | "/" | "%" | "&&" | "||" | "+"
 
 expr_term : "(" new_line_or_comment? expression new_line_or_comment? ")"
             | float_lit
@@ -50,10 +53,12 @@ int_lit : DECIMAL+
             | DECIMAL+ ("." DECIMAL+)? EXP_MARK DECIMAL+
 DECIMAL : "0".."9"
 EXP_MARK : ("e" | "E") ("+" | "-")?
+// Optional leading spaces/tabs, then an "=" that is not followed by "=" or ">".
+EQ : /[ \t]*=(?!=|>)/
 
 tuple : "[" (new_line_or_comment* expression new_line_or_comment* ",")* (new_line_or_comment* expression)? new_line_or_comment* "]"
 object : "{" new_line_or_comment? (object_elem (new_line_and_or_comma object_elem )* new_line_and_or_comma?)? "}"
-object_elem : (identifier | expression) ("=" | ":") expression
+object_elem : (identifier | expression) ( EQ | ":") expression
+
 
 heredoc_template : /<<(?P<heredoc>[a-zA-Z][a-zA-Z0-9._-]+)\n(?:.|\n)*?(?P=heredoc)/
 heredoc_template_trim : /<<-(?P<heredoc_trim>[a-zA-Z][a-zA-Z0-9._-]+)\n(?:.|\n)*?(?P=heredoc_trim)/
@@ -78,4 +83,3 @@ full_splat : "[*]" (get_attr | index)*
 !for_cond : "if" new_line_or_comment? expression
 
 %ignore /[ \t]+/
-%ignore /\/\*(.|\n)*?(\*\/)/
diff --git a/hcl2/reconstructor.py b/hcl2/reconstructor.py
@@ -0,0 +1,162 @@
+"""A reconstructor for HCL2 implemented using Lark's experimental reconstruction functionality"""
+
+from lark import Lark
+from lark.reconstruct import Reconstructor
+from lark.utils import is_id_continue
+
+# Parser instance used for reconstruction. This is duplicated from `parser`
+# because we need different options here for the reconstructor. Please make
+# sure changes are kept in sync between the two if necessary.
+hcl2 = Lark.open(
+    "hcl2.lark",
+    parser="lalr",
+    # Caching must be disabled to allow for reconstruction until lark-parser/lark#1472 is fixed:
+    #
+    #   https://github.com/lark-parser/lark/issues/1472
+    #
+    # cache=str(PARSER_FILE),  # Disable/Delete file to effect changes to the grammar
+    rel_to=__file__,
+    propagate_positions=True,
+    maybe_placeholders=False,  # Needed for reconstruction
+)
+
+# The sets below drive the spacing decisions made in `_add_extra_space`.
+
+# a token ending with one of these characters is followed by a space
+CHAR_SPACE_AFTER = set(',~@<>="|?)]:')
+# a token starting with one of these characters is preceded by a space; this is
+# CHAR_SPACE_AFTER without "," and "=", plus the single quote
+CHAR_SPACE_BEFORE = (CHAR_SPACE_AFTER - set(",=")) | set("'")
+# whole tokens (compared for equality, not by first/last character) that are
+# followed by a space
+KEYWORDS_SPACE_AFTER = [
+    "if",
+    "in",
+    "for",
+    "for_each",
+    "==",
+    "!=",
+    "<",
+    ">",
+    "<=",
+    ">=",
+    "-",
+    "*",
+    "/",
+    "%",
+    "&&",
+    "||",
+    "+",
+]
+# the same tokens are also preceded by a space (this is the same list object)
+KEYWORDS_SPACE_BEFORE = KEYWORDS_SPACE_AFTER
+# a token ending in a digit and a token starting with a digit are not separated
+DIGITS = set("0123456789")
+# no space is added after a token ending with one of these characters ...
+NEVER_SPACE_AFTER = set("[(")
+# ... or before a token starting with one of these characters
+NEVER_SPACE_BEFORE = set("]),.")
+# a deferred comma is dropped when the next token starts with one of these
+NEVER_COMMA_BEFORE = set("])}")
+# characters that are OK to come right after an identifier with no space between
+IDENT_NO_SPACE = set("()[]")
+
+
+def _add_extra_space(prev_item, item):
+    """
+    Decide whether a single space should be emitted between two adjacent tokens.
+
+    The rules are evaluated in order: first the cases where a space is
+    explicitly disallowed, then the cases where one is required. If no rule
+    matches, no space is added.
+
+    :param prev_item: Text of the token that was emitted last.
+    :param item: Text of the token about to be emitted.
+    :return: True if a space belongs between the two tokens, otherwise False.
+    """
+    # pylint: disable=too-many-boolean-expressions, too-many-return-statements
+
+    # without text on both sides there is nothing to separate (and no first or
+    # last character to inspect)
+    if not prev_item or not item:
+        return False
+
+    prev_last = prev_item[-1]
+    item_first = item[0]
+
+    ##### the scenarios where we explicitly disallow spaces: #####
+
+    # if we already have a space, don't add another
+    if prev_last.isspace() or item_first.isspace():
+        return False
+
+    # none of the following should be separated by spaces:
+    # - groups of digits
+    # - namespaced::function::calls
+    # - characters within an identifier like array[0]()
+    if (
+        (prev_last in DIGITS and item_first in DIGITS)
+        or item == "::"
+        or prev_item == "::"
+        or (prev_last in IDENT_NO_SPACE and item_first in IDENT_NO_SPACE)
+    ):
+        return False
+
+    # specific characters are also blocklisted from having spaces
+    if prev_last in NEVER_SPACE_AFTER or item_first in NEVER_SPACE_BEFORE:
+        return False
+
+    ##### the scenarios where we add spaces: #####
+
+    # scenario 1, the prev token ended with an identifier character
+    # and the next character is not an "IDENT_NO_SPACE" character
+    if is_id_continue(prev_last) and item_first not in IDENT_NO_SPACE:
+        return True
+
+    # scenario 2, the prev token or the next token should be followed by a space
+    if (
+        prev_last in CHAR_SPACE_AFTER
+        or prev_item in KEYWORDS_SPACE_AFTER
+        or item_first in CHAR_SPACE_BEFORE
+        or item in KEYWORDS_SPACE_BEFORE
+    ):
+        return True
+
+    # scenario 3, the previous token was a block opening brace and
+    # the next token is not a closing brace (so the block is on one
+    # line and not empty)
+    if prev_last == "{" and item_first != "}":
+        return True
+
+    ##### otherwise, we don't add a space #####
+    return False
+
+
+def _postprocess_reconstruct(items):
+    """
+    Postprocess the stream of tokens derived from the AST during reconstruction.
+
+    For HCL2, this inserts whitespace in the right locations and drops a
+    comma that would otherwise directly precede a closing bracket,
+    parenthesis or brace.
+
+    :param items: Iterable of token strings in output order.
+    :return: Generator yielding the tokens, with " " separators inserted
+        where `_add_extra_space` asks for them.
+    """
+    prev_item = ""
+    # a token starting with "," whose output is held back until the token
+    # after it has been seen; None when nothing is pending
+    deferred = None
+
+    for item in items:
+        # empty tokens contribute no text and have no first character to
+        # inspect, so skip them instead of failing on `item[0]`
+        if not item:
+            continue
+
+        # first, handle any deferred token
+        if deferred is not None:
+            prev_item, deferred = deferred, None
+
+            # a bare comma is dropped if we're ending a block; any other
+            # deferred token is always printed
+            if prev_item != "," or item[0] not in NEVER_COMMA_BEFORE:
+                yield prev_item
+
+        # if we're between two tokens, determine if we need to add an extra space
+        if prev_item and _add_extra_space(prev_item, item):
+            yield " "
+
+        # prevent the inclusion of extra commas if they are not intended by
+        # holding the token back; everything else is printed right away
+        if item[0] == ",":
+            deferred = item
+        else:
+            yield item
+
+        # store the previous item for the next token
+        prev_item = item
+
+    # if the last token was deferred, print it before finishing
+    if deferred is not None:
+        yield deferred
+
+
+class HCLReconstructor:
+    """Turns a Lark.Tree AST back into the HCL source text it represents."""
+
+    def __init__(self, parser):
+        # Lark's Reconstructor is built once from the given parser and reused
+        # for every call to `reconstruct`.
+        self._recons = Reconstructor(parser)
+
+    def reconstruct(self, tree):
+        """Render a Lark.Tree AST as a string of HCL.
+
+        Lark's own space insertion is switched off; spacing is handled by
+        `_postprocess_reconstruct` instead.
+        """
+        hcl_text = self._recons.reconstruct(
+            tree,
+            _postprocess_reconstruct,
+            insert_spaces=False,
+        )
+        return hcl_text
+
+
+# Module-level reconstructor instance wrapping the `hcl2` parser defined in this module.
+hcl2_reconstructor = HCLReconstructor(hcl2)
diff --git a/hcl2/transformer.py b/hcl2/transformer.py
@@ -93,7 +93,10 @@ def object_elem(self, args: List) -> Dict:
         # This returns a dict with a single key/value pair to make it easier to merge these
         # into a bigger dict that is returned by the "object" function
         key = self.strip_quotes(args[0])
-        value = self.to_string_dollar(args[1])
+        if len(args) == 3:
+            value = self.to_string_dollar(args[2])
+        else:
+            value = self.to_string_dollar(args[1])
 
         return {key: value}
 
@@ -148,7 +151,7 @@ def attribute(self, args: List) -> Attribute:
         key = str(args[0])
         if key.startswith('"') and key.endswith('"'):
             key = key[1:-1]
-        value = self.to_string_dollar(args[1])
+        value = self.to_string_dollar(args[2])
         return Attribute(key, value)
 
     def conditional(self, args: List) -> str: