diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 0000000..d4c4ec5 Binary files /dev/null and b/.DS_Store differ diff --git a/.vscode/launch.json b/.vscode/launch.json index 566cbfd..fa97e11 100644 --- a/.vscode/launch.json +++ b/.vscode/launch.json @@ -75,6 +75,30 @@ "ignoreFailures": true } ] + }, + { + "name": "Char Test", + "type": "cppdbg", + "request": "launch", + "program": "${workspaceFolder}/build/tests/char_tests", + "args": [], + "stopAtEntry": false, + "cwd": "${workspaceFolder}", + "environment": [], + "externalConsole": false, + "MIMode": "gdb", + "setupCommands": [ + { + "description": "Enable pretty-printing for gdb", + "text": "-enable-pretty-printing", + "ignoreFailures": true + }, + { + "description": "Set Disassembly Flavor to Intel", + "text": "-gdb-set disassembly-flavor intel", + "ignoreFailures": true + } + ] } ] } \ No newline at end of file diff --git a/.vscode/settings.json b/.vscode/settings.json index ee3f40c..91e254b 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -67,6 +67,7 @@ "thread": "cpp", "cinttypes": "cpp", "typeinfo": "cpp", - "variant": "cpp" + "variant": "cpp", + "list": "cpp" } } \ No newline at end of file diff --git a/CMakeLists.txt b/CMakeLists.txt index 7f91517..241d93e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -28,4 +28,5 @@ ADD_SUBDIRECTORY(tests) ENABLE_TESTING() ADD_TEST(NAME integer_tests COMMAND integer_tests) ADD_TEST(NAME double_tests COMMAND double_tests) -ADD_TEST(NAME bool_tests COMMAND bool_tests) \ No newline at end of file +ADD_TEST(NAME bool_tests COMMAND bool_tests) +ADD_TEST(NAME char_tests COMMAND char_tests) diff --git a/Hoo.g4 b/Hoo.g4 index 390d548..6278114 100644 --- a/Hoo.g4 +++ b/Hoo.g4 @@ -2,9 +2,25 @@ grammar Hoo; import hoolexer; -statement: literalStatement; +statement: expressionStatement; -literalStatement: literal ';'; +expressionStatement: expression ';'; + +expression: + primary # PrimaryExpression + | expression '*' expression # 
MultiplicationExpression + | expression '/' expression # DivisionExpression + | expression '%' expression # ReminderExpression + | expression '+' expression # AdditiveExpression + | expression '-' expression # SubtractExpression + | expression '<' '<' expression # LeftShiftExpression + | expression '>' '>' expression # RightShiftExpression + | expression '&' '&' expression # LogicalAndExpression + | expression '|' '|' expression # LogicalOrExpression; + +primary: + literal # PrimaryLiteral + | '(' expression ')' # NestedExpression; literal: INTEGER_LITERAL diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index dab7bd8..80dca30 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -11,12 +11,25 @@ ADD_LIBRARY(hoocore STATIC ParseError.hpp ParseErrorHandler.hpp ParseErrorHandler.cpp - ParseErrorException.hpp) + ParseErrorException.hpp + Utility.hpp + UTF8Char.h) + +ADD_LIBRARY(hooruntime STATIC + UTF8Char.h + UTF8Char.cpp + ) + ADD_EXECUTABLE(hoo Hoo.cpp Visitor.cpp ${ANTLR_GENERATED_DIR}/HooBaseVisitor.cpp ${ANTLR_GENERATED_DIR}/HooVisitor.cpp ${ANTLR_GENERATED_DIR}/HooLexer.cpp ${ANTLR_GENERATED_DIR}/HooParser.cpp) -ADD_DEPENDENCIES(hoo HooBaseVisitor) -TARGET_LINK_LIBRARIES(hoo antlr4-runtime LLVMCore LLVMSupport) + +ADD_DEPENDENCIES(hoo HooBaseVisitor hoocore hooruntime) +TARGET_LINK_LIBRARIES(hoo hoocore + hooruntime + antlr4-runtime + LLVMCore + LLVMSupport) diff --git a/src/UTF8Char.cpp b/src/UTF8Char.cpp new file mode 100644 index 0000000..c65ed4e --- /dev/null +++ b/src/UTF8Char.cpp @@ -0,0 +1,5 @@ +#include "UTF8Char.h" + +#include + +const std::string CHAR_TYPE_NAME = "hoo.UTF8Char"; diff --git a/src/UTF8Char.h b/src/UTF8Char.h new file mode 100644 index 0000000..c65b344 --- /dev/null +++ b/src/UTF8Char.h @@ -0,0 +1,25 @@ +#ifndef UTF8_CHAR_H +#define UTF8_CHAR_H + +#include +#include + +extern "C" const std::string CHAR_TYPE_NAME; +const uint64_t UTF8_CHAR_BYTES = 2; + +#ifdef __cplusplus +extern "C" +{ +#endif + typedef struct + { + + uint8_t 
// ---------------------------------------------------------------------------
// src/UTF8Char.h
//
// Declarations reproduced unchanged: the helpers below return UTF8Char and
// bound their writes by UTF8_CHAR_BYTES.
// ---------------------------------------------------------------------------
#ifndef UTF8_CHAR_H
#define UTF8_CHAR_H

#include <cstdint>
#include <string>

extern "C" const std::string CHAR_TYPE_NAME;
const uint64_t UTF8_CHAR_BYTES = 2;

#ifdef __cplusplus
extern "C"
{
#endif
    // NOTE(review): `bytes` holds 16-bit elements although only 8-bit values
    // are stored in it, and Visitor.cpp models the field as an array of i8 -
    // confirm which layout the runtime expects before changing either side.
    typedef struct
    {

        uint8_t length;
        uint16_t bytes[UTF8_CHAR_BYTES];

    } UTF8Char;

#ifdef __cplusplus
}
#endif
#endif // UTF8Char

// ---------------------------------------------------------------------------
// src/Utility.hpp
// ---------------------------------------------------------------------------
#pragma once

// Guarded include: skipped when UTF8Char.h was already seen in this
// translation unit.
#ifndef UTF8_CHAR_H
#include "UTF8Char.h"
#endif

#include <array>
#include <cstddef>
#include <cstdint>
#include <map>
#include <stdexcept>
#include <string>
#include <vector>

/// Maps every hexadecimal digit character (either case) to its numeric value.
static const std::map<char, int> HEX_DIGITS = {
    {'0', 0},
    {'1', 1},
    {'2', 2},
    {'3', 3},
    {'4', 4},
    {'5', 5},
    {'6', 6},
    {'7', 7},
    {'8', 8},
    {'9', 9},
    {'a', 10},
    {'b', 11},
    {'c', 12},
    {'d', 13},
    {'e', 14},
    {'f', 15},
    {'A', 10},
    {'B', 11},
    {'C', 12},
    {'D', 13},
    {'E', 14},
    {'F', 15},
};

/// Stateless helpers that turn the source text of a character literal
/// (quotes included) into a UTF8Char value.
class Utility
{
public:
    static constexpr std::uint8_t CHAR_NEWLINE = 10;
    static constexpr std::uint8_t CHAR_CARRIAGE_RETURN = 13;
    static constexpr std::uint8_t CHAR_TAB = 9;
    static constexpr std::uint8_t CHAR_BACKSPACE = 8;
    static constexpr std::uint8_t CHAR_FORMFEED = 12;
    static constexpr std::uint8_t CHAR_BACK_SLASH = 92;
    static constexpr std::uint8_t CHAR_SINGLE_QUOTE = 39;

    static constexpr int CHAR_A_UPPER = 65;
    static constexpr int CHAR_F_UPPER = 70;
    static constexpr int CHAR_0_DIGIT = 48;
    static constexpr int CHAR_9_DIGIT = 57;
    static constexpr int CHAR_A_LOWER = 97;
    static constexpr int CHAR_F_LOWER = 102;

    /// Number of hex digits in a `\uXXXX` escape.
    static constexpr int UTF8_HEXDIGIT_COUNT = 4;
    /// Number of hex digits that describe one byte (also the number of
    /// bytes a `\uXXXX` escape is split into).
    static constexpr int UTF8_BYTE_COUNT = 2;

    /// Wraps a single 7-bit character in a UTF8Char of length 1.
    static UTF8Char getCharType(std::uint8_t ansiChar)
    {
        UTF8Char single{};
        single.length = 1;
        single.bytes[0] = ansiChar;
        return single;
    }

    /// Splits the four hex digits of a `\uXXXX` escape into two 2-digit halves.
    /// @throws std::invalid_argument when `input` is not exactly 4 characters.
    static std::array<std::string, 2> splitUnicodeString(const std::string &input)
    {
        if (input.length() != static_cast<std::size_t>(UTF8_HEXDIGIT_COUNT))
        {
            throw std::invalid_argument("Invalid hex string.");
        }

        return {input.substr(0, UTF8_BYTE_COUNT), input.substr(2, UTF8_BYTE_COUNT)};
    }

    /// Parses exactly two hex digits into the byte they denote.
    /// @throws std::invalid_argument on wrong length or a non-hex character.
    static uint8_t
    hexStringToByte(const std::string &hexByteString)
    {
        if (hexByteString.length() != static_cast<std::size_t>(UTF8_BYTE_COUNT))
        {
            throw std::invalid_argument("Hex string must be exactly 2 characters long.");
        }

        const int high = hexDigitValue(hexByteString[0]);
        const int low = hexDigitValue(hexByteString[1]);
        return static_cast<uint8_t>((high << 4) | low);
    }

    /// Joins two bytes into a 16-bit value, `byte1` being the high byte.
    static uint16_t combineUint8s(uint8_t byte1, uint8_t byte2)
    {
        return static_cast<uint16_t>((static_cast<uint16_t>(byte1) << 8) | byte2);
    }

    /// Encodes a Basic Multilingual Plane code point as 1-3 UTF-8 bytes.
    /// @throws std::invalid_argument for U+D800..U+DFFF: a lone surrogate is
    ///         not a scalar value and has no well-formed UTF-8 encoding.
    static std::vector<uint8_t> utf16CodePointToUTF8(uint16_t codePoint)
    {
        if (codePoint >= 0xD800 && codePoint <= 0xDFFF)
        {
            throw std::invalid_argument("Invalid Unicode code point (surrogate half).");
        }

        std::vector<uint8_t> utf8Bytes;
        utf8Bytes.reserve(3);

        if (codePoint <= 0x7F)
        {
            utf8Bytes.push_back(static_cast<uint8_t>(codePoint));
        }
        else if (codePoint <= 0x7FF)
        {
            utf8Bytes.push_back(static_cast<uint8_t>(0xC0 | (codePoint >> 6)));
            utf8Bytes.push_back(static_cast<uint8_t>(0x80 | (codePoint & 0x3F)));
        }
        else
        {
            // A 16-bit argument cannot exceed 0xFFFF, so this is the last case.
            utf8Bytes.push_back(static_cast<uint8_t>(0xE0 | (codePoint >> 12)));
            utf8Bytes.push_back(static_cast<uint8_t>(0x80 | ((codePoint >> 6) & 0x3F)));
            utf8Bytes.push_back(static_cast<uint8_t>(0x80 | (codePoint & 0x3F)));
        }

        return utf8Bytes;
    }

    /// Converts the four hex digits of a `\uXXXX` escape into a UTF8Char.
    /// @throws std::invalid_argument when the digits are malformed or when the
    ///         encoded form needs more than UTF8_CHAR_BYTES bytes. The latter
    ///         used to write past the end of `UTF8Char::bytes`.
    static UTF8Char hexStringToUTF8(const std::string &hexString)
    {
        const auto halves = splitUnicodeString(hexString);
        const uint16_t codePoint = combineUint8s(hexStringToByte(halves[0]),
                                                 hexStringToByte(halves[1]));
        const auto encoded = utf16CodePointToUTF8(codePoint);

        if (encoded.size() > UTF8_CHAR_BYTES)
        {
            throw std::invalid_argument("Unicode code point does not fit in UTF8Char.");
        }

        UTF8Char utf8Char{};
        utf8Char.length = static_cast<uint8_t>(encoded.size());
        for (std::size_t index = 0; index < encoded.size(); ++index)
        {
            utf8Char.bytes[index] = encoded[index];
        }
        return utf8Char;
    }

    /// Decodes the text of a character literal, surrounding quotes included.
    ///
    /// Only the first character or escape sequence is interpreted; anything
    /// after it is ignored. A literal with no content, or with nothing after
    /// a backslash, yields a UTF8Char of length 0.
    ///
    /// @throws std::runtime_error for an unknown escape or a non-ASCII byte.
    /// @throws std::invalid_argument for a malformed or oversized `\u` escape.
    static UTF8Char
    getChar(std::string charText)
    {
        const UTF8Char empty{};

        // Too short to even hold both quotes: nothing to decode.
        if (charText.size() < 2)
        {
            return empty;
        }

        const std::string content = charText.substr(1, charText.size() - 2);
        if (content.empty())
        {
            return empty;
        }

        const char first = content[0];
        if (first != '\\')
        {
            // Compare as unsigned: a plain `char` may be signed, which would
            // make every byte >= 0x80 look negative and pass a `<= 0x7F` test.
            if (static_cast<unsigned char>(first) > 0x7F)
            {
                throw std::runtime_error("Invalid ansi character");
            }
            return getCharType(static_cast<std::uint8_t>(first));
        }

        if (content.size() < 2)
        {
            return empty;
        }

        switch (content[1])
        {
        case 'n':
            return getCharType(CHAR_NEWLINE);
        case 'r':
            return getCharType(CHAR_CARRIAGE_RETURN);
        case 't':
            return getCharType(CHAR_TAB);
        case 'b':
            return getCharType(CHAR_BACKSPACE);
        case 'f':
            return getCharType(CHAR_FORMFEED);
        case '\\':
            return getCharType(CHAR_BACK_SLASH);
        case '\'':
            return getCharType(CHAR_SINGLE_QUOTE);
        case 'u':
            return hexStringToUTF8(content.substr(2, UTF8_HEXDIGIT_COUNT));
        default:
            throw std::runtime_error("Invalid char escape sequence");
        }
    }

private:
    /// Looks up one hex digit, reporting bad input the same way as the
    /// length checks do.
    static int hexDigitValue(char digit)
    {
        const auto found = HEX_DIGITS.find(digit);
        if (found == HEX_DIGITS.end())
        {
            throw std::invalid_argument("Invalid hexadecimal digit.");
        }
        return found->second;
    }
};
charType->setBody({byteType, byteArrayType}); + llvm::Constant *charConstant = llvm::ConstantStruct::get( + charType, + {llvm::ConstantInt::get(byteType, charValue.length), + llvm::ConstantArray::get(byteArrayType, { + llvm::ConstantInt::get(byteType, charValue.bytes[0]), + llvm::ConstantInt::get(byteType, charValue.bytes[1]), + })}); return std::any{Node(NODE_LITERAL, DATATYPE_CHAR, charConstant)}; } @@ -82,23 +93,44 @@ std::any Visitor::visitLiteral(HooParser::LiteralContext *ctx) throw ParseErrorException(_moduleName, line_no, char_pos, message); } -std::any Visitor::visitLiteralStatement(HooParser::LiteralStatementContext *ctx) +std::any Visitor::visitPrimaryLiteral(HooParser::PrimaryLiteralContext *ctx) { - auto literal_context = ctx->literal(); - auto result = visitLiteral(literal_context); - return result; + auto literal = ctx->literal(); + if (literal != nullptr) + { + auto node = visitLiteral(literal); + return node; + } + return std::any(); +} + +std::any Visitor::visitNestedExpression(HooParser::NestedExpressionContext *ctx) +{ + auto expr_ctx = ctx->expression(); + auto node = visit(expr_ctx); + return node; +} + +std::any Visitor::visitExpressionStatement(HooParser::ExpressionStatementContext *ctx) +{ + auto expressionCtx = ctx->expression(); + if (expressionCtx != nullptr) + { + auto node = visit(expressionCtx); + return node; + } + return std::any(); } std::any Visitor::visitStatement(HooParser::StatementContext *ctx) { - auto listeral_stmt_ctx = ctx->literalStatement(); - std::cout << "Statement: " << ctx->getText() << std::endl; - if (listeral_stmt_ctx != nullptr) + auto expr_stmt_ctx = ctx->expressionStatement(); + if (expr_stmt_ctx != nullptr) { - auto result = visitLiteralStatement(listeral_stmt_ctx); - return result; + auto node = visitExpressionStatement(expr_stmt_ctx); + return node; } - return nullptr; + return std::any(); } std::any Visitor::visitUnit(HooParser::UnitContext *ctx) diff --git a/src/Visitor.hpp b/src/Visitor.hpp index 
6c6229f..b327400 100644 --- a/src/Visitor.hpp +++ b/src/Visitor.hpp @@ -21,7 +21,9 @@ public: public: std::any visitLiteral(HooParser::LiteralContext *ctx) override; - std::any visitLiteralStatement(HooParser::LiteralStatementContext *ctx) override; + std::any visitPrimaryLiteral(HooParser::PrimaryLiteralContext *ctx) override; + std::any visitNestedExpression(HooParser::NestedExpressionContext *ctx) override; + std::any visitExpressionStatement(HooParser::ExpressionStatementContext *ctx) override; std::any visitStatement(HooParser::StatementContext *ctx) override; std::any visitUnit(HooParser::UnitContext *ctx) override; diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 6254421..21067f4 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -1,13 +1,17 @@ FIND_PACKAGE (GTest REQUIRED) ADD_EXECUTABLE(integer_tests integer_tests.cpp) -TARGET_LINK_LIBRARIES(integer_tests GTest::GTest GTest::Main hoocore antlr4-runtime LLVMCore LLVMSupport) +TARGET_LINK_LIBRARIES(integer_tests GTest::GTest GTest::Main hoocore hooruntime antlr4-runtime LLVMCore LLVMSupport) TARGET_INCLUDE_DIRECTORIES(integer_tests PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../src) ADD_EXECUTABLE(double_tests double_tests.cpp) -TARGET_LINK_LIBRARIES(double_tests GTest::GTest GTest::Main hoocore antlr4-runtime LLVMCore LLVMSupport) +TARGET_LINK_LIBRARIES(double_tests GTest::GTest GTest::Main hoocore hooruntime antlr4-runtime LLVMCore LLVMSupport) TARGET_INCLUDE_DIRECTORIES(double_tests PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../src) ADD_EXECUTABLE(bool_tests bool_tests.cpp) -TARGET_LINK_LIBRARIES(bool_tests GTest::GTest GTest::Main hoocore antlr4-runtime LLVMCore LLVMSupport) -TARGET_INCLUDE_DIRECTORIES(bool_tests PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../src) \ No newline at end of file +TARGET_LINK_LIBRARIES(bool_tests GTest::GTest GTest::Main hoocore hooruntime antlr4-runtime LLVMCore LLVMSupport) +TARGET_INCLUDE_DIRECTORIES(bool_tests PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../src) + 
+ADD_EXECUTABLE(char_tests char_tests.cpp) +TARGET_LINK_LIBRARIES(char_tests GTest::GTest GTest::Main hoocore hooruntime antlr4-runtime LLVMCore LLVMSupport) +TARGET_INCLUDE_DIRECTORIES(char_tests PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../src) \ No newline at end of file diff --git a/tests/bool_tests.cpp b/tests/bool_tests.cpp index be5b118..789ef01 100644 --- a/tests/bool_tests.cpp +++ b/tests/bool_tests.cpp @@ -55,7 +55,7 @@ TEST_F(BoolTest, LiteralFalse) TEST_F(BoolTest, InvalidLiteral) { auto compiler = std::make_unique("notbool;", "main"); - ASSERT_THROW(compiler->compile(), ParseErrorException); + ASSERT_THROW(compiler->compile(), ParseCollectiveErrorException); } TEST_F(BoolTest, MissingSemicolon) @@ -67,5 +67,5 @@ TEST_F(BoolTest, MissingSemicolon) TEST_F(BoolTest, MixedCaseLiteral) { auto compiler = std::make_unique("True;", "main"); - ASSERT_THROW(compiler->compile(), ParseErrorException); + ASSERT_THROW(compiler->compile(), ParseCollectiveErrorException); } diff --git a/tests/char_tests.cpp b/tests/char_tests.cpp new file mode 100644 index 0000000..675a257 --- /dev/null +++ b/tests/char_tests.cpp @@ -0,0 +1,67 @@ +#include "Compiler.hpp" +#include "Node.hpp" +#include "UTF8Char.h" + +#include "llvm/IR/Constants.h" +#include + +// TEST(CharTest, SingleChar) +// { +// auto compiler = std::make_unique("'a';", "main"); +// auto result = compiler->compile(); +// auto charNode = std::any_cast(result); + +// ASSERT_EQ(charNode.getNodeType(), NODE_LITERAL); +// ASSERT_EQ(charNode.getDataType(), DATATYPE_CHAR); + +// auto value = charNode.getValue(); +// auto expected_value = llvm::ConstantInt::get(llvm::Type::getInt8Ty(*compiler->getContext()), 97); + +// ASSERT_EQ(value, expected_value); +// } + +// TEST(CharTest, SpecialChar) +// { +// auto compiler = std::make_unique("'\\n';", "main"); +// auto result = compiler->compile(); +// auto charNode = std::any_cast(result); + +// ASSERT_EQ(charNode.getNodeType(), NODE_LITERAL); +// ASSERT_EQ(charNode.getDataType(), 
DATATYPE_CHAR); + +// auto value = charNode.getValue(); +// auto expected_value = llvm::ConstantInt::get(llvm::Type::getInt8Ty(*compiler->getContext()), 10); + +// ASSERT_EQ(value, expected_value); +// } + +TEST(CharTest, UnicodeChar) +{ + auto compiler = std::make_unique("'\\u00E9';", "main"); + auto result = compiler->compile(); + auto charNode = std::any_cast(result); + + ASSERT_EQ(charNode.getNodeType(), NODE_LITERAL); + ASSERT_EQ(charNode.getDataType(), DATATYPE_CHAR); + + auto value = charNode.getValue(); + auto value_name = value->getType()->getStructName().str(); + ASSERT_EQ(value_name, CHAR_TYPE_NAME); + auto structValue = llvm::dyn_cast(value); + ASSERT_NE(structValue, nullptr); + llvm::Value *length = llvm::dyn_cast(structValue->getAggregateElement((unsigned int)0)); + ASSERT_NE(length, nullptr); + auto length_value = llvm::dyn_cast(length)->getZExtValue(); + ASSERT_EQ(length_value, 2); + + llvm::ConstantDataArray *array = llvm::dyn_cast(structValue->getAggregateElement((unsigned int)1)); + ASSERT_NE(array, nullptr); + auto byte1 = array->getElementAsConstant(0); + auto byte2 = array->getElementAsConstant(1); + + auto byte1_value = llvm::dyn_cast(byte1)->getZExtValue(); + ASSERT_EQ(byte1_value, 195); + + auto byte2_value = llvm::dyn_cast(byte2)->getZExtValue(); + ASSERT_EQ(byte2_value, 169); +} diff --git a/tests/integer_tests.cpp b/tests/integer_tests.cpp index 81cf58a..acccb42 100644 --- a/tests/integer_tests.cpp +++ b/tests/integer_tests.cpp @@ -1,9 +1,3 @@ -#include "Compiler.hpp" -#include "Node.hpp" -#include "llvm/IR/Constants.h" - -#include - #include "Compiler.hpp" #include "Node.hpp" #include "llvm/IR/Constants.h" @@ -38,6 +32,11 @@ TEST(IntegerTest, LiteralOne) testIntegerLiteral("1;", 1); } +TEST(NestedIntegerTest, LiteralOne) +{ + testIntegerLiteral("(1);", 1); +} + TEST(IntegerTest, LiteralNumberPositive) { testIntegerLiteral("67890;", 67890);