//===--- Forest.h - Parse forest, the output of the GLR parser ---*- C++-*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// A parse forest represents a set of possible parse trees efficiently, it is
// produced by the GLR parser.
//
// Despite the name, its data structure is a tree-like DAG with a single root.
// Multiple ways to parse the same tokens are presented as an ambiguous node
// with all possible interpretations as children.
// Common sub-parses are shared: if two interpretations both parse "1 + 1" as
// "expr := expr + expr", they will share a Sequence node representing the expr.
//
//===----------------------------------------------------------------------===//

#ifndef CLANG_PSEUDO_FOREST_H
#define CLANG_PSEUDO_FOREST_H

#include "clang-pseudo/Grammar.h"
#include "clang-pseudo/Token.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/Support/Allocator.h"
#include <cstdint>

namespace clang {
namespace pseudo {

// A node represents ways to parse a sequence of tokens, it interprets a fixed
// range of tokens as a fixed grammar symbol.
//
// There are different kinds of nodes, some nodes have "children" (stored in a
// trailing array) and have pointers to them. "Children" has different semantics
// depending on the node kinds. For an Ambiguous node, it means all
// possible interpretations; for a Sequence node, it means each symbol on the
// right hand side of the production rule.
//
// Since this is a node in a DAG, a node may have multiple parents. And a node
// doesn't have parent pointers.
class alignas(class ForestNode *) ForestNode {
public:
  enum Kind : uint8_t {
    // A Terminal node is a single terminal symbol bound to a token.
    Terminal,
    // A Sequence node is a nonterminal symbol parsed from a grammar rule,
    // elements() are the parses of each symbol on the RHS of the rule.
    // If the rule is A := X Y Z, the node is for nonterminal A, and elements()
    // are [X, Y, Z].
    Sequence,
    // An Ambiguous node exposes multiple ways to interpret the code as the
    // same symbol, alternatives() are all possible parses.
    Ambiguous,
    // An Opaque node is a placeholder. It asserts that tokens match a symbol,
    // without saying how.
    // It is used for lazy-parsing (not parsed yet), or error-recovery (invalid
    // code).
    Opaque,
  };
  Kind kind() const { return K; }

  SymbolID symbol() const { return Symbol; }

  // The start of the token range, it is a poistion within a token stream.
  Token::Index startTokenIndex() const { return StartIndex; }

  // Returns the corresponding grammar rule.
  // REQUIRES: this is a Sequence node.
  RuleID rule() const {
    assert(kind() == Sequence);
    return Data & ((1 << RuleBits) - 1);
  }
  // Returns the parses of each element on the RHS of the rule.
  // REQUIRES: this is a Sequence node;
  llvm::ArrayRef<const ForestNode *> elements() const {
    assert(kind() == Sequence);
    return children(Data >> RuleBits);
  };

  // Returns all possible interpretations of the code.
  // REQUIRES: this is an Ambiguous node.
  llvm::ArrayRef<const ForestNode *> alternatives() const {
    assert(kind() == Ambiguous);
    return children(Data);
  }

  std::string dump(const Grammar &) const;
  std::string dumpRecursive(const Grammar &, bool Abbreviated = false) const;

private:
  friend class ForestArena;

  ForestNode(Kind K, SymbolID Symbol, Token::Index StartIndex, uint16_t Data)
      : StartIndex(StartIndex), K(K), Symbol(Symbol), Data(Data) {}

  ForestNode(const ForestNode &) = delete;
  ForestNode &operator=(const ForestNode &) = delete;
  ForestNode(ForestNode &&) = delete;
  ForestNode &operator=(ForestNode &&) = delete;

  static uint16_t sequenceData(RuleID Rule,
                               llvm::ArrayRef<const ForestNode *> Elements) {
    assert(Rule < (1 << RuleBits));
    assert(Elements.size() < (1 << (16 - RuleBits)));
    return Rule | Elements.size() << RuleBits;
  }
  static uint16_t
  ambiguousData(llvm::ArrayRef<const ForestNode *> Alternatives) {
    return Alternatives.size();
  }

  // Retrieves the trailing array.
  llvm::ArrayRef<const ForestNode *> children(uint16_t Num) const {
    return llvm::makeArrayRef(reinterpret_cast<ForestNode *const *>(this + 1),
                              Num);
  }

  Token::Index StartIndex;
  Kind K : 4;
  SymbolID Symbol : SymbolBits;
  // Sequence - child count : 4 | RuleID : RuleBits (12)
  // Ambiguous - child count : 16
  // Terminal, Opaque - unused
  uint16_t Data;
  // An array of ForestNode* following the object.
};
// ForestNode may not be destroyed (for BumpPtrAllocator).
static_assert(std::is_trivially_destructible<ForestNode>(), "");

// A memory arena for the parse forest.
class ForestArena {
public:
  llvm::ArrayRef<ForestNode> createTerminals(const TokenStream &Code);
  ForestNode &createSequence(SymbolID SID, RuleID RID,
                             llvm::ArrayRef<const ForestNode *> Elements) {
    assert(!Elements.empty());
    return create(ForestNode::Sequence, SID,
                  Elements.front()->startTokenIndex(),
                  ForestNode::sequenceData(RID, Elements), Elements);
  }
  ForestNode &createAmbiguous(SymbolID SID,
                              llvm::ArrayRef<const ForestNode *> Alternatives) {
    assert(!Alternatives.empty());
    assert(llvm::all_of(Alternatives,
                        [SID](const ForestNode *Alternative) {
                          return SID == Alternative->symbol();
                        }) &&
           "Ambiguous alternatives must represent the same symbol!");
    return create(ForestNode::Ambiguous, SID,
                  Alternatives.front()->startTokenIndex(),
                  ForestNode::ambiguousData(Alternatives), Alternatives);
  }
  ForestNode &createOpaque(SymbolID SID, Token::Index Start) {
    return create(ForestNode::Opaque, SID, Start, 0, {});
  }

  ForestNode &createTerminal(tok::TokenKind TK, Token::Index Start) {
    return create(ForestNode::Terminal, tokenSymbol(TK), Start, 0, {});
  }

  size_t nodeCount() const { return NodeCount; }
  size_t bytes() const { return Arena.getBytesAllocated() + sizeof(this); }

private:
  ForestNode &create(ForestNode::Kind K, SymbolID SID, Token::Index Start,
                     uint16_t Data,
                     llvm::ArrayRef<const ForestNode *> Elements) {
    ++NodeCount;
    ForestNode *New = new (Arena.Allocate(
        sizeof(ForestNode) + Elements.size() * sizeof(ForestNode *),
        alignof(ForestNode))) ForestNode(K, SID, Start, Data);
    if (!Elements.empty())
      llvm::copy(Elements, reinterpret_cast<const ForestNode **>(New + 1));
    return *New;
  }

  llvm::BumpPtrAllocator Arena;
  uint32_t NodeCount = 0;
};

} // namespace pseudo
} // namespace clang

#endif // CLANG_PSEUDO_FOREST_H
