directcpp 0.1.15

Direct call cpp from Rust
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
#!/usr/bin/env python3
# encoding:utf-8

# NOTE: this is NOT a fully functional converter; it is designed to convert only
# certain forms of Rust structs into C++ structs while keeping them bit-for-bit compatible.
# Use it at your OWN RISK; you can extend it as you need.
# The #[repr(C)] attribute is required for a Rust struct to be layout-compatible with a C struct.

import re
import sys
import argparse
import io
import bisect


class MyParser:
    def __init__(self, instr, rules, token_func):
        self.con_str = ""
        self.kw_map = dict()
        self.op_map = dict()
        self.eof_comment = None
        # https://en.wikipedia.org/wiki/Unicode_block
        self.kw_start = 0x2460
        self.kw_end = 0x257f
        self.op_start = 0x2200
        self.op_end = 0x22ff
        self.op_map[';'] = ';'
        self.tokens = []
        self.comments = []
        self.current_comment = []
        self.instr = instr
        self.token_func = token_func
        # load rules.
        self.translated_rules = []
        for rl_meta, rl in rules.items():
            assert isinstance(rl, str)
            self._parse_rule(rl, rl_meta)

    def _run(self):
        beg1 = 0
        last_token = None
        instr = self.instr
        while True:
            name, ss, len1 = self.token_func(instr)
            if name == "EOF":
                break
            if name in ['SPACE', "NEWLINE"]:
                if name == 'NEWLINE':
                    last_token = None  # when a new line starts, the comment should attach to the next token.
                beg1 += len1
                instr = instr[len1:]
                continue
            if name in {"COMMENT", "COMMENT1", "COMMENT2", "COMMENT3"}:
                self.comments.append([last_token if last_token is not None else len(self.tokens), ss])
                if self.eof_comment and self.eof_comment in ss:
                    break
                beg1 += len1
                instr = instr[len1:]
                continue
            last_token = len(self.tokens)  # mark position of last important token
            if name == "KEYWORD":
                self.con_str += self._char4kw(ss)
            elif name == "OPERATOR":
                self.con_str += self._char4op(ss)
            elif name == "IDEN" or name == "IDENTIFIER":
                self.con_str += 'I'
            elif name == "NUMBER":
                self.con_str += 'N'
            elif name == "STRING":
                self.con_str += 'S'
            elif name == "STRING1":
                self.con_str += '\u2192'
            elif name == "STRING2":
                self.con_str += '\u21d2'
            else:
                raise RuntimeError(f"unknown token: {name}")
            self.tokens.append((name, ss, beg1, len1))
            beg1 += len1
            instr = instr[len1:]
        self.all_keywords = "".join(self.kw_map.values())
        self.all_operators = "".join(self.op_map.values())

    def _parse_rule(self, rl, rl_meta):
        regex = ""
        while rl:
            if m := re.match(r"(?:KW|KEYWORD)\{(\w+)}", rl):
                regex += self._char4kw(m.group(1))
                rl = rl[m.end():]
            elif m := re.match(r"(?:OP|OPERATOR)\{(.*?)}", rl):
                regex += self._char4op(m.group(1))
                rl = rl[m.end():]
            elif m := re.match(r"(?:OP|OPERATOR)<(.*?)>", rl):
                regex += self._char4op(m.group(1))
                rl = rl[m.end():]
            elif m := re.match(r"IDEN(?:TIFIER)?", rl):
                regex += 'I'
                rl = rl[m.end():]
            elif m := re.match(r"NUM(?:BER)?", rl):
                regex += 'N'
                rl = rl[m.end():]
            elif m := re.match(r"STR(?:ING)?(\d?)", rl):
                if m.group(1) == "1":
                    regex += '\u2192'
                elif m.group(1) == "2":
                    regex += '\u21d2'
                elif m.group(1) == "":
                    regex += 'S'
                else:
                    raise RuntimeError("unknown string type")
                rl = rl[m.end():]
            elif m := re.match(r"\s+", rl):
                rl = rl[m.end():]
            elif m := re.match(r"[(){}\[\]|+\-?*:,.=0-9]+", rl):
                regex += m.group()
                rl = rl[m.end():]
            else:
                raise RuntimeError(f"unknown token: {rl}")
        self.translated_rules.append((re.compile(regex), rl_meta))

    def _char4kw(self, ss):
        if ss in self.kw_map:
            return self.kw_map.get(ss)
        if self.kw_start > self.kw_end:
            raise RuntimeError("too many keywords")
        next_kw = chr(self.kw_start)
        self.kw_start += 1
        self.kw_map[ss] = next_kw
        return next_kw

    def _char4op(self, ss):
        if ss in self.op_map:
            return self.op_map.get(ss)
        if self.op_start > self.op_end:
            raise RuntimeError("too many operators")
        next_op = chr(self.op_start)
        self.op_start += 1
        self.op_map[ss] = next_op
        return next_op

    def _back2str(self, pre_tokens, matched):
        prev_itm = ' '
        rets = ""
        # if prev_itm and itm both in [keywords, identifiers, number, string], we need a space between them.
        need_space = self.all_keywords + "INS\u2192\u21d2"
        for itm in matched:
            _, ss, beg1, len1 = self.tokens[pre_tokens]
            if prev_itm in need_space and itm in need_space:
                rets += ' '
            rets += ss
            pre_tokens += 1
            prev_itm = itm
        return rets

    def _back2strs(self, precnt, m, ng):
        retss = []
        for i in range(ng):
            if m.start(i+1) == -1:
                retss.append(None)
            else:
                retss.append(self._back2str(precnt + m.start(i+1), m.group(i+1)))
        return retss

    def _load_comments(self, start, end):
        s1 = bisect.bisect_left(self.comments, start, key=lambda x: x[0])
        e1 = bisect.bisect_right(self.comments, end, lo=s1, key=lambda x: x[0])
        return [x[1] for x in self.comments[s1:e1]]

    def parse(self):
        self._run()
        pre_tokens = 0
        while self.con_str:
            for regex, rl_meta in self.translated_rules:
                if m := regex.match(self.con_str):
                    self.current_comment = self._load_comments(pre_tokens, pre_tokens+len(m.group()))
                    if self.eof_comment is not None and (self.eof_comment in self.current_comment):
                        return
                    yield rl_meta, self._back2str(pre_tokens, m.group()), self._back2strs(pre_tokens, m, regex.groups)
                    pre_tokens += len(m.group())
                    self.con_str = self.con_str[m.end():]
                    break
            else:
                if self.eof_comment is not None and any([self.eof_comment in x for x in self.current_comment]):
                    return
                ss1 = self._back2str(pre_tokens, self.con_str[0:10])
                raise RuntimeError(f"no rule matched: {ss1}")


class Rust2H:
    """Translate ``pub`` ``#[repr(C)]`` Rust structs and enums into a C++ header.

    Input is read from a file (or stdin), parsed with ``MyParser`` using the
    rules declared in :meth:`translate`, and the generated header is written
    to the output file (or stdout) by :meth:`flush`.
    """

    # Multi-character operators; the 3-character ones are tested first so the
    # longest spelling wins (e.g. "..=" over "..").
    _OPERATORS_3 = ("..=", "<<=", ">>=")
    _OPERATORS_2 = ("!=", "%=", "&&", "&=", "*=", "+=", "-=", "..", "/=", "<<", "<=", "==", ">=", ">>", "^=", "|=",
                    "||", "::")
    _OPERATOR_CHARS = "!%&*+-/<=>?^|{}[](),.:;#"
    _KEYWORDS = frozenset((
        "as", "break", "const", "continue", "crate", "else", "enum", "extern", "false", "fn", "for", "if", "impl",
        "in", "let", "loop", "match", "mod", "move", "mut", "pub", "ref", "return", "self", "Self", "static",
        "struct", "super", "trait", "true", "type", "unsafe", "use", "where", "while", "async", "await", "dyn",
    ))
    # Rust scalar / string types and the C++ spelling emitted for them.
    _TYPE_MAP = {
        "String": "RustString", "u32": "uint32_t", "i32": "int", "u64": "uint64_t", "i64": "int64_t",
        "usize": "size_t", "isize": "ssize_t", "f32": "float", "f64": "double", "u8": "uint8_t", "i8": "int8_t",
        "u16": "uint16_t", "i16": "int16_t", "byte": "uint8_t",
    }

    @staticmethod
    def _scan_quoted(instr, quote):
        """Return the index of the closing ``quote`` of the literal at ``instr[0]``.

        A backslash skips the following character.  Returns ``-1`` when the
        literal is not terminated, including when a trailing backslash makes
        the scan run past the end of the input.
        """
        i = 1
        while i < len(instr) and instr[i] != quote:
            i += 2 if instr[i] == "\\" else 1
        return i if i < len(instr) else -1

    @staticmethod
    def get_token(instr):
        """Split the first token off ``instr``.

        :returns: ``(name, text, consumed)`` where ``name`` is one of ``EOF``,
            ``SPACE``, ``COMMENT``, ``STRING2`` (double quoted), ``STRING1``
            (single quoted), ``IDEN``, ``OPERATOR``, ``KEYWORD`` or ``NUMBER``
            and ``consumed`` is the number of characters to drop from the
            front of ``instr``.
        :raises Exception: on an unterminated comment/string/char literal or
            an unrecognised character.
        """
        if not instr:
            return "EOF", "", 0
        head = instr[0]
        if head == '\ufeff':
            # A byte-order mark is treated as whitespace.
            return "SPACE", head, 1
        if head.isspace():
            i = 1
            while i < len(instr) and instr[i].isspace():
                i += 1
            return "SPACE", instr[:i], i
        if instr.startswith("/*"):
            # Search after the opener so that "/*/" is not taken as a complete comment.
            i = instr.find("*/", 2)
            if i == -1:
                raise Exception("comment not closed")
            return "COMMENT", instr[:i + 2], i + 2
        if instr.startswith("//"):
            i = instr.find("\n")
            if i == -1:
                return "COMMENT", instr, len(instr)
            return "COMMENT", instr[:i], i
        if head == '"':
            end = Rust2H._scan_quoted(instr, '"')
            if end == -1:
                raise Exception("string not closed")
            return "STRING2", instr[:end + 1], end + 1
        if head == "'":
            end = Rust2H._scan_quoted(instr, "'")
            if end == -1:
                raise Exception("char not closed")
            return "STRING1", instr[:end + 1], end + 1
        if len(instr) > 2 and instr[:2] == "r#":
            # Raw identifier: the token text is the name without the "r#" prefix.
            i = 2
            while i < len(instr) and (instr[i].isalnum() or instr[i] == "_"):
                i += 1
            return "IDEN", instr[2:i], i
        if instr.startswith(Rust2H._OPERATORS_3):
            return "OPERATOR", instr[:3], 3
        if instr.startswith(Rust2H._OPERATORS_2):
            return "OPERATOR", instr[:2], 2
        if head in Rust2H._OPERATOR_CHARS:
            return "OPERATOR", head, 1
        if head.isalpha() or head == "_":
            i = 1
            while i < len(instr) and (instr[i].isalnum() or instr[i] == "_"):
                i += 1
            word = instr[:i]
            if word in Rust2H._KEYWORDS:
                return "KEYWORD", word, i
            return "IDEN", word, i
        if head.isdigit():
            i = 1
            while i < len(instr) and instr[i].isalnum():
                i += 1
            return "NUMBER", instr[:i], i
        raise Exception(f"unknown token: {instr}")

    def __init__(self, infile, outfile):
        """Prepare the converter.

        :param infile: path of the Rust source; a falsy value means stdin.
        :param outfile: path of the header to write; a falsy value means stdout.
        """
        # Generated declarations are buffered here and written out by flush().
        self.outf = io.StringIO()
        self.inf = sys.stdin
        self.outfile = outfile
        if infile:
            self.inf = open(infile, "r", encoding="utf-8")
        # [variant name, payload type or None, attached comment text] of the enum being read.
        self._enum_fields = []
        self._enum_name = ''

    def _trans_type(self, ss, has_err):
        """Translate a Rust type spelling into its C++ counterpart.

        ``Vec<T>`` and ``Option<T>`` are translated recursively; names in the
        built-in table are mapped; any other generic or path type is reported
        by setting ``has_err[0]`` to 1 and returning an ``<ERROR_TYPE(..)>``
        marker.  Plain unknown names are returned unchanged.
        """
        if m := re.match(r"Vec<(.+)>", ss):
            return f"RustVec<{self._trans_type(m.group(1), has_err)}>"
        if m := re.match(r"Option<(.+)>", ss):
            return f"RustOption<{self._trans_type(m.group(1), has_err)}>"
        if ss in self._TYPE_MAP:
            return self._TYPE_MAP[ss]
        if '<' in ss or "::" in ss:
            has_err[0] = 1
            return f"<ERROR_TYPE({ss})>"
        return ss

    def _build_enum_st(self, has_err):
        """Return the tag member, payload union and closing brace of an enum struct.

        Only variants that carry a payload type contribute a union member.
        The member name comes from a ``var=<name>`` hint in the variant's
        comment, otherwise from the lower-cased variant name.  A member is
        emitted only if neither its name nor its type has been used already.
        """
        payloads = [(v[0], self._trans_type(v[1], has_err), v[2]) for v in self._enum_fields if v[1]]
        lines = ["\ttype_t type;\n", "\tunion {\n"]
        used_varnames = set()
        used_vartypes = set()
        for variant, cpp_type, comment in payloads:
            hint = re.search(r'var=(\w+)', comment)
            varname = hint.group(1) if hint else variant.lower()
            if varname in used_varnames or cpp_type in used_vartypes:
                continue
            used_varnames.add(varname)
            used_vartypes.add(cpp_type)
            lines.append(f"\t\t{cpp_type} {varname};\n")
        lines.append("\t};\n")
        lines.append("};\n")
        return "".join(lines)

    def translate(self):
        """Read the input, convert every eligible struct/enum and write the header.

        A struct or enum is converted only when it is ``pub`` and one of the
        attributes collected before it contains ``repr(C)``; others are
        skipped with a message on stderr.  Parsing stops at a comment
        containing ``__end_of_rust2h_header__``.

        :raises AssertionError: when a matched rule is not valid in the
            current state, or a two-parameter generic field appears in a
            struct that is being converted.
        """
        ATTRS = 10
        STRUCT_START = 20
        STRUCT_FIELD = 30
        STRUCT_FIELD1 = 31
        STRUCT_END = 40
        ENUM_START = 50
        ENUM_FIELD = 60
        USE_1 = 100
        USE_2 = 101
        rules = {
            # struct meta like #[derive(Serialize, Deserialize, Debug, Clone)]
            ATTRS: "OP{#} OP{[} IDEN OP{(} (IDEN (?:OP{=}[STR2|NUM])?  OP{,})* IDEN (?:OP{=}[STR2|NUM])? OP{)} OP{]}",
            # start a pub? struct.
            STRUCT_START: "(KW{pub})? KW{struct} (IDEN) OP<{>",
            # struct field
            STRUCT_FIELD: "(KW{pub})? (IDEN) OP{:} ((?: IDEN OP{::})*IDEN (OP{<} (?:IDEN OP{::})* IDEN OP{>})?) (?: OP{,} | (?= OP<}> ) )",
            STRUCT_FIELD1: "(KW{pub})? (IDEN) OP{:} ((?: IDEN OP{::})*IDEN OP{<} IDEN OP{,} IDEN OP{>}) OP{,}?",
            # struct end
            STRUCT_END: "OP<}> OP{;}?",

            ENUM_START: "(KW{pub})? KW{enum} (IDEN) OP<{>",
            ENUM_FIELD: "(IDEN) (?: OP<(> (IDEN) OP<)> )? OP{,}?",
            USE_1: "KW{use} (?: OP{::} KW{crate} OP{::})? (?:IDEN OP{::})* IDEN OP{;}",
            USE_2: "KW{use} (?: OP{::} KW{crate} OP{::})? (?:IDEN OP{::})* OP<{> ( IDEN OP{,})* IDEN  OP<}> OP{;}",
        }
        instr = self.inf.read()
        p = MyParser(instr, rules, Rust2H.get_token)
        p.eof_comment = "__end_of_rust2h_header__"
        ST_INIT, ST_STRUCT, ST_END, ST_ENUM = 0, 1, 2, 3
        state = ST_INIT
        attrs = []
        should_trans = False
        current_st = ""
        # One-element list so _trans_type can flag an error by mutation.
        has_err = [0]
        for tp, ss, grps in p.parse():
            if tp in (USE_1, USE_2):
                continue
            if tp == ATTRS:
                # Attributes inside a struct/enum body are ignored.
                if state == ST_INIT:
                    attrs.append(ss)
            elif tp == STRUCT_START and state == ST_INIT:
                should_trans = grps[0] == "pub" and any("repr(C)" in x for x in attrs)
                current_st += f"struct {grps[1]} {{\n"
                state = ST_STRUCT
                if not should_trans:
                    print(f"Skip struct {grps[1]} since it's not repr(C)", file=sys.stderr)
            elif tp == STRUCT_FIELD and state == ST_STRUCT:
                current_st += "\t"
                current_st += self._trans_type(grps[2], has_err)
                current_st += f" {grps[1]};\n"
            elif tp == STRUCT_END and state in (ST_STRUCT, ST_ENUM):
                # For an enum this closes the nested `enum type_t`, hence the indent.
                current_st += "%s};\n" % ("\t" if state == ST_ENUM else "")
                attrs = []
                if should_trans:
                    if has_err[0]:
                        print(f"Warning: the struct/enum has unknown type of fields, please fixit and retry:\n{current_st}", file=sys.stderr)
                    else:
                        if state == ST_ENUM:
                            current_st += self._build_enum_st(has_err)
                        print(current_st, file=self.outf)
                current_st = ""
                has_err = [0]
                state = ST_INIT
            elif tp == ENUM_START and state == ST_INIT:
                should_trans = grps[0] == "pub" and any("repr(C)" in x for x in attrs)
                current_st += f"struct {grps[1]} {{\n\tenum type_t : int {{\n"
                state = ST_ENUM
                self._enum_fields = []
                self._enum_name = grps[1]
                if not should_trans:
                    print(f"Skip enum {grps[1]} since it's not repr(C)", file=sys.stderr)
            elif tp == ENUM_FIELD and state == ST_ENUM:
                # CamelCase variant name -> UPPER_SNAKE_CASE enumerator.
                snake = re.sub(r'[A-Z]', lambda x: '_' + x.group().lower(), grps[0])
                variant = re.sub(r'^_', '', snake).upper()
                self._enum_fields.append([variant, grps[1], "\n".join(p.current_comment)])
                current_st += f"\t\t{variant},\n"
            elif tp == STRUCT_FIELD1 and state == ST_STRUCT:
                # Explicit raise (not `assert`) so the check survives `python -O`.
                if should_trans:
                    raise AssertionError("complex member should only be in ignored struct")
            else:
                raise AssertionError(f"unkown branch {tp=} {state=}")
        self.flush()

    def flush(self):
        """Prefix the header preamble and write the result.

        The output file is rewritten only when its content would change; with
        no output file the header goes to stdout.  A UTF-8 BOM is written when
        the generated declarations contain non-ASCII characters.
        """
        content = self.outf.getvalue()
        encoding = "utf-8" if content.isascii() else "utf-8-sig"
        header = "#pragma once\n"
        if "RustOption" in content:
            header += """#include "rust/rust-common.h"\n"""
        else:
            header += """#include "rust/rust-spt.h"\n"""
        content = header + "\n" + content
        self.outf.close()
        # Only close an input file this object opened; leave sys.stdin alone.
        if self.inf is not sys.stdin:
            self.inf.close()

        if self.outfile:
            changed = True
            try:
                with open(self.outfile, "r", encoding="utf-8") as f:
                    rd = f.read()
                    if rd.startswith('\ufeff'):
                        rd = rd[1:]
                    changed = rd != content
            except FileNotFoundError:
                pass
            if changed:
                with open(self.outfile, "w", encoding=encoding) as f:
                    f.write(content)
            else:
                print(f"No changes for file {self.outfile}", file=sys.stderr)
        else:
            sys.stdout.write(content)
            sys.stdout.flush()


if __name__ == "__main__":
    def main():
        """Parse the command line and run the Rust-to-header conversion."""
        cli = argparse.ArgumentParser()
        cli.add_argument('-o', dest="outfile", help="Output file")
        cli.add_argument("infile", help="Input file")
        opts = cli.parse_args()
        converter = Rust2H(opts.infile, opts.outfile)
        converter.translate()


    main()