История изменений

Исправление rtxtxtrx, 22.12.23 14:38 (текущая версия) :

Кстати, я хоть и нафлудил с этими реализациями парсеров, но структурированный текст парсить не сложнее — я тоже набросал пример:

from __future__ import annotations

import string
import typing
from dataclasses import dataclass, field
from io import StringIO


class Node:
    """Common base class for the nodes of a parsed document tree."""


@dataclass
class Element(Node):
    """Markup element: a tag name, its attributes and its child nodes."""

    name: str  # tag name
    attrs: dict[str, str] = field(default_factory=dict)  # attribute name -> value
    children: list[Node] = field(default_factory=list)  # nested nodes, in source order


@dataclass
class Text(Node):
    """Run of plain character data found between tags."""

    value: str  # the text content


class SyntaxError(Exception):
    """Raised when the parsed markup is malformed.

    NOTE: this name shadows the builtin ``SyntaxError`` inside this module.
    """


class MLParser:
    """Recursive-descent parser for a tiny HTML-like markup language.

    Input is read one character at a time with a single character of
    lookahead: ``curch`` is the character consumed last, ``nextch`` is the
    character about to be consumed (``""`` once the input is exhausted).
    :meth:`parse` is the entry point; the other methods are parsing steps
    that rely on the state it sets up.
    """

    # Parser state, (re)initialised by parse().
    fp: typing.TextIO
    curch: typing.Optional[str]
    nextch: typing.Optional[str]
    open_tags: list[str]

    def readch(self) -> str:
        """Read a single character from the input stream."""
        return self.fp.read(1)

    def advance(self) -> None:
        """Consume the lookahead character and fetch a new lookahead."""
        self.curch, self.nextch = self.nextch, self.readch()

    def match(self, charset: str) -> bool:
        """Consume the lookahead if it is one of the characters in *charset*.

        Returns True when a character was consumed; False otherwise,
        including when there is no lookahead left.
        """
        if self.nextch and self.nextch in charset:
            self.advance()
            return True
        return False

    def expect(self, charset: str) -> None:
        """Like :meth:`match`, but raise ``SyntaxError`` on a mismatch."""
        if not self.match(charset):
            raise SyntaxError(f"syntax error at offset {self.fp.tell()}")

    def parse_name(self) -> str:
        """Consume and return a non-empty run of ASCII letters.

        Raises ``SyntaxError`` if the lookahead is not a letter.
        """
        chars = []
        while self.match(string.ascii_letters):
            chars.append(self.curch)
        if not chars:
            raise SyntaxError("required name")
        return "".join(chars)

    def skip_spaces(self) -> None:
        """Consume any run of whitespace characters (possibly empty)."""
        while self.match(string.whitespace):
            pass

    def parse_quouted(self) -> str:
        """Consume a double-quoted string and return its contents.

        There is no escape syntax: the value ends at the next ``"``.
        Raises ``SyntaxError`` if the opening quote is missing or the input
        ends before the closing quote.
        """
        self.expect('"')
        chars = []
        while not self.match('"'):
            if not self.nextch:
                raise SyntaxError("unexpected end")
            self.advance()
            chars.append(self.curch)
        return "".join(chars)

    def handle_close_tag(self) -> None:
        """Parse the rest of a close tag (after ``</``) and pop the tag stack.

        Raises ``SyntaxError`` if the name does not match the innermost
        open tag or if no tag is open.
        """
        name = self.parse_name()
        # Whitespace is tolerated between the name and ">", e.g. "</b >".
        self.skip_spaces()
        self.expect(">")
        if not self.open_tags or self.open_tags.pop() != name:
            raise SyntaxError(f"unexpected close tag {name}")

    def _parse_attrs(self) -> dict[str, str]:
        """Parse ``name="value"`` pairs up to and including the closing ``>``.

        Called right after the tag name has been consumed. Every attribute
        must be preceded by whitespace; whitespace directly before ``>`` is
        accepted. Returns the attributes in the order they were written.
        """
        attrs: dict[str, str] = {}
        while True:
            had_space = self.match(string.whitespace)
            self.skip_spaces()
            if self.match(">"):
                return attrs
            if not had_space:
                # Something other than ">" is glued to the previous token.
                raise SyntaxError(f"syntax error at offset {self.fp.tell()}")
            name = self.parse_name()
            self.skip_spaces()
            self.expect("=")
            self.skip_spaces()
            attrs[name] = self.parse_quouted()

    def collect_children(self, node: Element) -> None:
        """Parse content into ``node.children`` until a close tag or end of input.

        Text between tags becomes ``Text`` nodes; each open tag becomes an
        ``Element`` whose own content is parsed recursively.
        """
        chunks: list[str] = []
        while self.nextch:
            if not self.match("<"):
                self.advance()
                chunks.append(self.curch)
                continue
            # A tag starts here: flush the text collected so far.
            if chunks:
                node.children.append(Text("".join(chunks)))
                chunks = []
            if self.match("/"):
                self.handle_close_tag()
                break
            child = Element(self.parse_name())
            self.open_tags.append(child.name)
            child.attrs.update(self._parse_attrs())
            self.collect_children(child)
            node.children.append(child)
        if chunks:
            node.children.append(Text("".join(chunks)))

    def parse(self, fp: typing.TextIO) -> Element:
        """Parse the markup read from *fp* and return a synthetic ``root`` element.

        Raises ``SyntaxError`` on malformed input, including tags that are
        still open when the input ends.
        """
        self.fp = fp
        self.curch = self.nextch = None
        self.open_tags = []
        self.advance()  # prime the one-character lookahead
        root = Element("root")
        self.collect_children(root)
        if self.open_tags:
            raise SyntaxError(f"unclosed tags: {', '.join(self.open_tags)}")
        return root


import sys

# Command-line entry point: parse the markup given as the first argument and
# print the resulting tree. Guarded so that importing this module has no side
# effects, and so a missing argument yields a usage message, not IndexError.
if __name__ == "__main__":
    if len(sys.argv) < 2:
        sys.exit(f"usage: {sys.argv[0]} MARKUP")
    print(MLParser().parse(StringIO(sys.argv[1])))

Вроде нормально все парсит:

python ml.py 'Hello, <span color="red"><i><b>world</b></i></span>!'
Element(name='root', attrs={}, children=[Text(value='Hello, '), Element(name='span', attrs={'color': 'red'}, children=[Element(name='i', attrs={}, children=[Element(name='b', attrs={}, children=[Text(value='world')])])]), Text(value='!')])

Для твоего диалекта текса изменения нужны минимальные. Или че ты там придумываешь. Простая задача, но малоприменимая, или не знаю, что там придумать: текс в хтмл превращать? Зато показательно тотальное превосходство питона над статически типизированными недоязыками. В нем НЕ НУЖНЫ DDD, паттерны и пр. чушь, хотя я уверен, что найдутся аметисты, которые даже тут их обнаружат (считаю Java абсолютным злом, как и заучивание [ее и применимых только в ней] паттернов).

Исправление rtxtxtrx, 22.12.23 14:37:

from __future__ import annotations

import string
import typing
from dataclasses import dataclass, field
from io import StringIO


class Node:
    """Common base class for the nodes of a parsed document tree."""


@dataclass
class Element(Node):
    """Markup element: a tag name, its attributes and its child nodes."""

    name: str  # tag name
    attrs: dict[str, str] = field(default_factory=dict)  # attribute name -> value
    children: list[Node] = field(default_factory=list)  # nested nodes, in source order


@dataclass
class Text(Node):
    """Run of plain character data found between tags."""

    value: str  # the text content


class SyntaxError(Exception):
    """Raised when the parsed markup is malformed.

    NOTE: this name shadows the builtin ``SyntaxError`` inside this module.
    """


class MLParser:
    """Recursive-descent parser for a tiny HTML-like markup language.

    Input is read one character at a time with a single character of
    lookahead: ``curch`` is the character consumed last, ``nextch`` is the
    character about to be consumed (``""`` once the input is exhausted).
    :meth:`parse` is the entry point; the other methods are parsing steps
    that rely on the state it sets up.
    """

    # Parser state, (re)initialised by parse().
    fp: typing.TextIO
    curch: typing.Optional[str]
    nextch: typing.Optional[str]
    open_tags: list[str]

    def readch(self) -> str:
        """Read a single character from the input stream."""
        return self.fp.read(1)

    def advance(self) -> None:
        """Consume the lookahead character and fetch a new lookahead."""
        self.curch, self.nextch = self.nextch, self.readch()

    def match(self, charset: str) -> bool:
        """Consume the lookahead if it is one of the characters in *charset*.

        Returns True when a character was consumed; False otherwise,
        including when there is no lookahead left.
        """
        if self.nextch and self.nextch in charset:
            self.advance()
            return True
        return False

    def expect(self, charset: str) -> None:
        """Like :meth:`match`, but raise ``SyntaxError`` on a mismatch."""
        if not self.match(charset):
            raise SyntaxError(f"syntax error at offset {self.fp.tell()}")

    def parse_name(self) -> str:
        """Consume and return a non-empty run of ASCII letters.

        Raises ``SyntaxError`` if the lookahead is not a letter.
        """
        chars = []
        while self.match(string.ascii_letters):
            chars.append(self.curch)
        if not chars:
            raise SyntaxError("required name")
        return "".join(chars)

    def skip_spaces(self) -> None:
        """Consume any run of whitespace characters (possibly empty)."""
        while self.match(string.whitespace):
            pass

    def parse_quouted(self) -> str:
        """Consume a double-quoted string and return its contents.

        There is no escape syntax: the value ends at the next ``"``.
        Raises ``SyntaxError`` if the opening quote is missing or the input
        ends before the closing quote.
        """
        self.expect('"')
        chars = []
        while not self.match('"'):
            if not self.nextch:
                raise SyntaxError("unexpected end")
            self.advance()
            chars.append(self.curch)
        return "".join(chars)

    def handle_close_tag(self) -> None:
        """Parse the rest of a close tag (after ``</``) and pop the tag stack.

        Raises ``SyntaxError`` if the name does not match the innermost
        open tag or if no tag is open.
        """
        name = self.parse_name()
        # Whitespace is tolerated between the name and ">", e.g. "</b >".
        self.skip_spaces()
        self.expect(">")
        if not self.open_tags or self.open_tags.pop() != name:
            raise SyntaxError(f"unexpected close tag {name}")

    def _parse_attrs(self) -> dict[str, str]:
        """Parse ``name="value"`` pairs up to and including the closing ``>``.

        Called right after the tag name has been consumed. Every attribute
        must be preceded by whitespace; whitespace directly before ``>`` is
        accepted. Returns the attributes in the order they were written.
        """
        attrs: dict[str, str] = {}
        while True:
            had_space = self.match(string.whitespace)
            self.skip_spaces()
            if self.match(">"):
                return attrs
            if not had_space:
                # Something other than ">" is glued to the previous token.
                raise SyntaxError(f"syntax error at offset {self.fp.tell()}")
            name = self.parse_name()
            self.skip_spaces()
            self.expect("=")
            self.skip_spaces()
            attrs[name] = self.parse_quouted()

    def collect_children(self, node: Element) -> None:
        """Parse content into ``node.children`` until a close tag or end of input.

        Text between tags becomes ``Text`` nodes; each open tag becomes an
        ``Element`` whose own content is parsed recursively.
        """
        chunks: list[str] = []
        while self.nextch:
            if not self.match("<"):
                self.advance()
                chunks.append(self.curch)
                continue
            # A tag starts here: flush the text collected so far.
            if chunks:
                node.children.append(Text("".join(chunks)))
                chunks = []
            if self.match("/"):
                self.handle_close_tag()
                break
            child = Element(self.parse_name())
            self.open_tags.append(child.name)
            child.attrs.update(self._parse_attrs())
            self.collect_children(child)
            node.children.append(child)
        if chunks:
            node.children.append(Text("".join(chunks)))

    def parse(self, fp: typing.TextIO) -> Element:
        """Parse the markup read from *fp* and return a synthetic ``root`` element.

        Raises ``SyntaxError`` on malformed input, including tags that are
        still open when the input ends.
        """
        self.fp = fp
        self.curch = self.nextch = None
        self.open_tags = []
        self.advance()  # prime the one-character lookahead
        root = Element("root")
        self.collect_children(root)
        if self.open_tags:
            raise SyntaxError(f"unclosed tags: {', '.join(self.open_tags)}")
        return root


import sys

# Command-line entry point: parse the markup given as the first argument and
# print the resulting tree. Guarded so that importing this module has no side
# effects, and so a missing argument yields a usage message, not IndexError.
if __name__ == "__main__":
    if len(sys.argv) < 2:
        sys.exit(f"usage: {sys.argv[0]} MARKUP")
    print(MLParser().parse(StringIO(sys.argv[1])))

Вроде нормально все парсит:

python ml.py 'Hello, <span color="red"><i><b>world</b></i></span>!'
Element(name='root', attrs={}, children=[Text(value='Hello, '), Element(name='span', attrs={'color': 'red'}, children=[Element(name='i', attrs={}, children=[Element(name='b', attrs={}, children=[Text(value='world')])])]), Text(value='!')])

Для твоего диалекта текса изменения нужны минимальные. Или че ты там придумываешь. Простая задача, но малоприменимая. Зато показательно тотальное превосходство питона над статически типизированными недоязыками. В нем НЕ НУЖНЫ DDD, паттерны и пр. чушь, хотя я уверен, что найдутся аметисты, которые даже тут их обнаружат (считаю Java абсолютным злом, как и заучивание [ее и применимых только в ней] паттернов).

Исправление rtxtxtrx, 22.12.23 14:33:

from __future__ import annotations

import string
import typing
from dataclasses import dataclass, field
from io import StringIO


class Node:
    """Common base class for the nodes of a parsed document tree."""


@dataclass
class Element(Node):
    """Markup element: a tag name, its attributes and its child nodes."""

    name: str  # tag name
    attrs: dict[str, str] = field(default_factory=dict)  # attribute name -> value
    children: list[Node] = field(default_factory=list)  # nested nodes, in source order


@dataclass
class Text(Node):
    """Run of plain character data found between tags."""

    value: str  # the text content


class SyntaxError(Exception):
    """Raised when the parsed markup is malformed.

    NOTE: this name shadows the builtin ``SyntaxError`` inside this module.
    """


class MLParser:
    """Recursive-descent parser for a tiny HTML-like markup language.

    Input is read one character at a time with a single character of
    lookahead: ``curch`` is the character consumed last, ``nextch`` is the
    character about to be consumed (``""`` once the input is exhausted).
    :meth:`parse` is the entry point; the other methods are parsing steps
    that rely on the state it sets up.
    """

    # Parser state, (re)initialised by parse().
    fp: typing.TextIO
    curch: typing.Optional[str]
    nextch: typing.Optional[str]
    open_tags: list[str]

    def readch(self) -> str:
        """Read a single character from the input stream."""
        return self.fp.read(1)

    def advance(self) -> None:
        """Consume the lookahead character and fetch a new lookahead."""
        self.curch, self.nextch = self.nextch, self.readch()

    def match(self, charset: str) -> bool:
        """Consume the lookahead if it is one of the characters in *charset*.

        Returns True when a character was consumed; False otherwise,
        including when there is no lookahead left.
        """
        if self.nextch and self.nextch in charset:
            self.advance()
            return True
        return False

    def expect(self, charset: str) -> None:
        """Like :meth:`match`, but raise ``SyntaxError`` on a mismatch."""
        if not self.match(charset):
            raise SyntaxError(f"syntax error at offset {self.fp.tell()}")

    def parse_name(self) -> str:
        """Consume and return a non-empty run of ASCII letters.

        Raises ``SyntaxError`` if the lookahead is not a letter.
        """
        chars = []
        while self.match(string.ascii_letters):
            chars.append(self.curch)
        if not chars:
            raise SyntaxError("required name")
        return "".join(chars)

    def skip_spaces(self) -> None:
        """Consume any run of whitespace characters (possibly empty)."""
        while self.match(string.whitespace):
            pass

    def parse_quouted(self) -> str:
        """Consume a double-quoted string and return its contents.

        There is no escape syntax: the value ends at the next ``"``.
        Raises ``SyntaxError`` if the opening quote is missing or the input
        ends before the closing quote.
        """
        self.expect('"')
        chars = []
        while not self.match('"'):
            if not self.nextch:
                raise SyntaxError("unexpected end")
            self.advance()
            chars.append(self.curch)
        return "".join(chars)

    def handle_close_tag(self) -> None:
        """Parse the rest of a close tag (after ``</``) and pop the tag stack.

        Raises ``SyntaxError`` if the name does not match the innermost
        open tag or if no tag is open.
        """
        name = self.parse_name()
        # Whitespace is tolerated between the name and ">", e.g. "</b >".
        self.skip_spaces()
        self.expect(">")
        if not self.open_tags or self.open_tags.pop() != name:
            raise SyntaxError(f"unexpected close tag {name}")

    def _parse_attrs(self) -> dict[str, str]:
        """Parse ``name="value"`` pairs up to and including the closing ``>``.

        Called right after the tag name has been consumed. Every attribute
        must be preceded by whitespace; whitespace directly before ``>`` is
        accepted. Returns the attributes in the order they were written.
        """
        attrs: dict[str, str] = {}
        while True:
            had_space = self.match(string.whitespace)
            self.skip_spaces()
            if self.match(">"):
                return attrs
            if not had_space:
                # Something other than ">" is glued to the previous token.
                raise SyntaxError(f"syntax error at offset {self.fp.tell()}")
            name = self.parse_name()
            self.skip_spaces()
            self.expect("=")
            self.skip_spaces()
            attrs[name] = self.parse_quouted()

    def collect_children(self, node: Element) -> None:
        """Parse content into ``node.children`` until a close tag or end of input.

        Text between tags becomes ``Text`` nodes; each open tag becomes an
        ``Element`` whose own content is parsed recursively.
        """
        chunks: list[str] = []
        while self.nextch:
            if not self.match("<"):
                self.advance()
                chunks.append(self.curch)
                continue
            # A tag starts here: flush the text collected so far.
            if chunks:
                node.children.append(Text("".join(chunks)))
                chunks = []
            if self.match("/"):
                self.handle_close_tag()
                break
            child = Element(self.parse_name())
            self.open_tags.append(child.name)
            child.attrs.update(self._parse_attrs())
            self.collect_children(child)
            node.children.append(child)
        if chunks:
            node.children.append(Text("".join(chunks)))

    def parse(self, fp: typing.TextIO) -> Element:
        """Parse the markup read from *fp* and return a synthetic ``root`` element.

        Raises ``SyntaxError`` on malformed input, including tags that are
        still open when the input ends.
        """
        self.fp = fp
        self.curch = self.nextch = None
        self.open_tags = []
        self.advance()  # prime the one-character lookahead
        root = Element("root")
        self.collect_children(root)
        if self.open_tags:
            raise SyntaxError(f"unclosed tags: {', '.join(self.open_tags)}")
        return root


import sys

# Command-line entry point: parse the markup given as the first argument and
# print the resulting tree. Guarded so that importing this module has no side
# effects, and so a missing argument yields a usage message, not IndexError.
if __name__ == "__main__":
    if len(sys.argv) < 2:
        sys.exit(f"usage: {sys.argv[0]} MARKUP")
    print(MLParser().parse(StringIO(sys.argv[1])))

Вроде нормально все парсит:

python ml.py 'Hello, <span color="red"><i><b>world</b></i></span>!'
Element(name='root', attrs={}, children=[Text(value='Hello, '), Element(name='span', attrs={'color': 'red'}, children=[Element(name='i', attrs={}, children=[Element(name='b', attrs={}, children=[Text(value='world')])])]), Text(value='!')])

Исходная версия rtxtxtrx, 22.12.23 14:21:

from __future__ import annotations

import string
import typing
from dataclasses import dataclass, field
from io import StringIO


class Node:
    """Common base class for the nodes of a parsed document tree."""


@dataclass
class Element(Node):
    """Markup element: a tag name, its attributes and its child nodes."""

    name: str  # tag name
    attrs: dict[str, str] = field(default_factory=dict)  # attribute name -> value
    children: list[Node] = field(default_factory=list)  # nested nodes, in source order


@dataclass
class Text(Node):
    """Run of plain character data found between tags."""

    value: str  # the text content


class SyntaxError(Exception):
    """Raised when the parsed markup is malformed.

    NOTE: this name shadows the builtin ``SyntaxError`` inside this module.
    """


class MLParser:
    """Recursive-descent parser for a tiny HTML-like markup language.

    Input is read one character at a time with a single character of
    lookahead: ``curch`` is the character consumed last, ``nextch`` is the
    character about to be consumed (``""`` once the input is exhausted).
    :meth:`parse` is the entry point; the other methods are parsing steps
    that rely on the state it sets up.
    """

    # Parser state, (re)initialised by parse().
    fp: typing.TextIO
    curch: typing.Optional[str]
    nextch: typing.Optional[str]
    open_tags: list[str]

    def readch(self) -> str:
        """Read a single character from the input stream."""
        return self.fp.read(1)

    def advance(self) -> None:
        """Consume the lookahead character and fetch a new lookahead."""
        self.curch, self.nextch = self.nextch, self.readch()

    def match(self, charset: str) -> bool:
        """Consume the lookahead if it is one of the characters in *charset*.

        Returns True when a character was consumed; False otherwise,
        including when there is no lookahead left.
        """
        if self.nextch and self.nextch in charset:
            self.advance()
            return True
        return False

    def expect(self, charset: str) -> None:
        """Like :meth:`match`, but raise ``SyntaxError`` on a mismatch."""
        if not self.match(charset):
            raise SyntaxError(f"syntax error at offset {self.fp.tell()}")

    def parse_name(self) -> str:
        """Consume and return a non-empty run of ASCII letters.

        Raises ``SyntaxError`` if the lookahead is not a letter.
        """
        chars = []
        while self.match(string.ascii_letters):
            chars.append(self.curch)
        if not chars:
            raise SyntaxError("required name")
        return "".join(chars)

    def skip_spaces(self) -> None:
        """Consume any run of whitespace characters (possibly empty)."""
        while self.match(string.whitespace):
            pass

    def parse_quouted(self) -> str:
        """Consume a double-quoted string and return its contents.

        There is no escape syntax: the value ends at the next ``"``.
        Raises ``SyntaxError`` if the opening quote is missing or the input
        ends before the closing quote.
        """
        self.expect('"')
        chars = []
        while not self.match('"'):
            if not self.nextch:
                raise SyntaxError("unexpected end")
            self.advance()
            chars.append(self.curch)
        return "".join(chars)

    def handle_close_tag(self) -> None:
        """Parse the rest of a close tag (after ``</``) and pop the tag stack.

        Raises ``SyntaxError`` if the name does not match the innermost
        open tag or if no tag is open.
        """
        name = self.parse_name()
        # Whitespace is tolerated between the name and ">", e.g. "</b >".
        self.skip_spaces()
        self.expect(">")
        if not self.open_tags or self.open_tags.pop() != name:
            raise SyntaxError(f"unexpected close tag {name}")

    def _parse_attrs(self) -> dict[str, str]:
        """Parse ``name="value"`` pairs up to and including the closing ``>``.

        Called right after the tag name has been consumed. Every attribute
        must be preceded by whitespace; whitespace directly before ``>`` is
        accepted. Returns the attributes in the order they were written.
        """
        attrs: dict[str, str] = {}
        while True:
            had_space = self.match(string.whitespace)
            self.skip_spaces()
            if self.match(">"):
                return attrs
            if not had_space:
                # Something other than ">" is glued to the previous token.
                raise SyntaxError(f"syntax error at offset {self.fp.tell()}")
            name = self.parse_name()
            self.skip_spaces()
            self.expect("=")
            self.skip_spaces()
            attrs[name] = self.parse_quouted()

    def collect_children(self, node: Element) -> None:
        """Parse content into ``node.children`` until a close tag or end of input.

        Text between tags becomes ``Text`` nodes; each open tag becomes an
        ``Element`` whose own content is parsed recursively.
        """
        chunks: list[str] = []
        while self.nextch:
            if not self.match("<"):
                self.advance()
                chunks.append(self.curch)
                continue
            # A tag starts here: flush the text collected so far.
            if chunks:
                node.children.append(Text("".join(chunks)))
                chunks = []
            if self.match("/"):
                self.handle_close_tag()
                break
            child = Element(self.parse_name())
            self.open_tags.append(child.name)
            child.attrs.update(self._parse_attrs())
            self.collect_children(child)
            node.children.append(child)
        if chunks:
            node.children.append(Text("".join(chunks)))

    def parse(self, fp: typing.TextIO) -> Element:
        """Parse the markup read from *fp* and return a synthetic ``root`` element.

        Raises ``SyntaxError`` on malformed input, including tags that are
        still open when the input ends.
        """
        self.fp = fp
        self.curch = self.nextch = None
        self.open_tags = []
        self.advance()  # prime the one-character lookahead
        root = Element("root")
        self.collect_children(root)
        if self.open_tags:
            raise SyntaxError(f"unclosed tags: {', '.join(self.open_tags)}")
        return root


import sys

# Command-line entry point: parse the markup given as the first argument and
# print the resulting tree. Guarded so that importing this module has no side
# effects, and so a missing argument yields a usage message, not IndexError.
if __name__ == "__main__":
    if len(sys.argv) < 2:
        sys.exit(f"usage: {sys.argv[0]} MARKUP")
    print(MLParser().parse(StringIO(sys.argv[1])))

Вроде нормально все парсит:

python ml.py 'Hello, <span color="red"><i><b>world</b></i></span>!'
Element(name='root', attrs={}, children=[Text(value='Hello, '), Element(name='span', attrs={'color': 'red'}, children=[Element(name='i', attrs={}, children=[Element(name='b', attrs={}, children=[Text(value='world')])])]), Text(value='!')])