<?xml version="1.0" encoding="iso-8859-1"?>
<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.2//EN"
   "http://www.oasis-open.org/docbook/xml/4.2/docbookx.dtd">

<book id="happy">
  <bookinfo>
    <date>2001-4-27</date>
    <title>Happy User Guide</title>
    <author>
      <firstname>Simon</firstname>
      <surname>Marlow</surname>
    </author>
    <author>
      <firstname>Andy</firstname>
      <surname>Gill</surname>
    </author>
    <address><email>simonmar@microsoft.com</email></address>
    <copyright>
      <year>1997-2009</year>
      <holder>Simon Marlow</holder>
    </copyright>
    <abstract>
      <para>This document describes Happy, the Haskell Parser
	Generator, version 1.18.</para>
    </abstract>
  </bookinfo>

  <!-- Table of contents -->
  <toc></toc>

<!-- Introduction ========================================================= -->

  <chapter id="happy-introduction">
    <title>Introduction</title>


    <para> <application>Happy</application> is a parser generator
    system for Haskell, similar to the tool
    <application>yacc</application> for C.  Like
    <application>yacc</application>, it takes a file containing an
    annotated BNF specification of a grammar and produces a Haskell
    module containing a parser for the grammar. </para>

    <indexterm><primary>yacc</primary></indexterm>

    <para> <application>Happy</application> is flexible: you can have several
    <application>Happy</application> parsers in the same program, and
    each parser may have multiple entry points.
    <application>Happy</application> can work in conjunction with a
    lexical analyser supplied by the user (either hand-written or
    generated by another program), or it can parse a stream of
    characters directly (but this isn't practical in most cases).  In
    a future version we hope to include a lexical analyser generator
    with <application>Happy</application> as a single package. </para>

    <para> Parsers generated by <application>Happy</application> are
    fast; generally faster than an equivalent parser written using
    parsing combinators or similar tools.  Furthermore, any future
    improvements made to <application>Happy</application> will benefit
    an existing grammar, without need for a rewrite. </para>

    <para> <application>Happy</application> is sufficiently powerful
    to parse full Haskell
    - <ulink url="http://www.haskell.org/ghc">GHC</ulink> itself uses
    a Happy parser.</para>

    <indexterm><primary><literal>hsparser</literal></primary></indexterm>
    <indexterm>
      <primary>Haskell parser</primary>
      <see><literal>hsparser</literal></see>
    </indexterm>

    <para> <application>Happy</application> can currently generate
    four types of parser from a given grammar, the intention being
    that we can experiment with different kinds of functional code to
    see which is the best, and compiler writers can use the different
    types of parser to tune their compilers.  The types of parser
    supported are: </para>

    <orderedlist>

      <listitem id="item-default-backend">
        <para><quote>standard</quote> Haskell 98 (should work with any compiler
	that compiles Haskell 98).</para>
      </listitem>

      <listitem>
        <para>standard Haskell using arrays
	<indexterm scope="all"><primary>arrays</primary></indexterm>
	<indexterm scope="all"><primary>back-ends</primary><secondary>arrays</secondary></indexterm>
	(this is not the default
	because we have found this generates slower parsers than <xref
	linkend="item-default-backend"/>).</para>
      </listitem>

      <listitem>
        <para>Haskell with GHC
	<indexterm><primary>GHC</primary></indexterm>
	<indexterm><primary>back-ends</primary><secondary>GHC</secondary></indexterm>
	(Glasgow Haskell) extensions. This is a
	slightly faster option than <xref
	linkend="item-default-backend"/> for Glasgow Haskell
	users.</para>
      </listitem>


      <listitem>
	<para>GHC Haskell with string-encoded arrays.  This is the
	fastest/smallest option for GHC users.  If you're using GHC,
	the optimum flag settings are <literal>-agc</literal> (see
	<xref linkend="sec-invoking"/>).</para>
      </listitem>

    </orderedlist>

    <para>Happy can also generate parsers which will dump debugging
    information at run time, showing state transitions and the input
    tokens to the parser.</para>

    <sect1 id="sec-compatibility">
      <title>Compatibility</title>

      <para> <application>Happy</application> is written in Glasgow Haskell.  This
      means that (for the time being), you need GHC to compile it.
      Any version of GHC >= 6.2 should work.</para>

      <para> Remember: parsers produced using
      <application>Happy</application> should compile without
      difficulty under any Haskell 98 compiler or interpreter.<footnote><para>With one
	exception: if you have a production with a polymorphic type signature,
	then a compiler that supports local universal quantification is
	required.  See <xref linkend="sec-type-signatures" />.</para>
	</footnote></para>
    </sect1>

    <sect1 id="sec-reporting-bugs">
      <title>Reporting Bugs</title>

      <indexterm>
	<primary>bugs, reporting</primary>
      </indexterm>

      <para> Any bugs found in <application>Happy</application> should
      be reported to me: Simon Marlow
      <email>marlowsd@gmail.com</email> including all the relevant
      information: the compiler used to compile
      <application>Happy</application>, the command-line options used,
      your grammar file or preferably a cut-down example showing the
      problem, and a description of what goes wrong.  A patch to fix
      the problem would also be greatly appreciated. </para>

      <para> Requests for new features should also be sent to the
      above address, especially if accompanied by patches :-).</para>

    </sect1>

    <sect1 id="sec-license">
      <title>License</title>

      <indexterm>
	<primary>License</primary>
      </indexterm>

      <para> Previous versions of <application>Happy</application>
      were covered by the GNU general public license.  We're now
      distributing <application>Happy</application> with a less
      restrictive BSD-style license.  If this license doesn't work for
      you, please get in touch.</para>

      <blockquote>
	<para> Copyright 2009, Simon Marlow and Andy Gill.  All rights
	reserved. </para>

	<para> Redistribution and use in source and binary forms, with
	or without modification, are permitted provided that the
	following conditions are met: </para>

	<itemizedlist>
	  <listitem>
	    <para>Redistributions of source code must retain the above
            copyright notice, this list of conditions and the
            following disclaimer.</para>
	  </listitem>

	  <listitem>
	    <para> Redistributions in binary form must reproduce the
            above copyright notice, this list of conditions and the
            following disclaimer in the documentation and/or other
            materials provided with the distribution.</para>
	  </listitem>
	</itemizedlist>

	<para>THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS "AS
        IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
        LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
        FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT
        SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY DIRECT,
        INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
        DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
        SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
        OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
        LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
        (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF
        THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY
        OF SUCH DAMAGE.</para>
      </blockquote>
    </sect1>

    <sect1 id="sec-obtaining">
      <title>Obtaining <application>Happy</application></title>

      <para> <application>Happy</application>'s web page can be found at <ulink
      url="http://www.haskell.org/happy/">http://www.haskell.org/happy/</ulink>.
      <application>Happy</application> source and binaries can be downloaded from
      there.</para>

    </sect1>

  </chapter>

<!-- Using Happy =========================================================== -->

  <chapter id="sec-using">
    <title>Using <application>Happy</application></title>

  <para> Users of <application>Yacc</application> will find
  <application>Happy</application> quite familiar.  The basic idea is
  as follows: </para>

  <itemizedlist>
    <listitem>
      <para>Define the grammar you want to parse in a
      <application>Happy</application> grammar file. </para>
    </listitem>

    <listitem>
      <para> Run the grammar through <application>Happy</application>, to generate
      a compilable Haskell module.</para>
    </listitem>

    <listitem>
      <para> Use this module as part of your Haskell program, usually
      in conjunction with a lexical analyser (a function that splits
      the input into <quote>tokens</quote>, the basic unit of parsing).</para>
    </listitem>
  </itemizedlist>

  <para> Let's run through an example.  We'll implement a parser for a
  simple expression syntax, consisting of integers, variables, the
  operators <literal>+</literal>, <literal>-</literal>, <literal>*</literal>,
  <literal>/</literal>, and the form <literal>let var = exp in exp</literal>.
  The grammar file starts off like this:</para>

<programlisting>
{
module Main where
}
</programlisting>

    <para>At the top of the file is an optional <firstterm>module
    header</firstterm>,
      <indexterm>
	<primary>module</primary>
	<secondary>header</secondary>
      </indexterm>
    which is just a Haskell module header enclosed in braces.  This
    code is emitted verbatim into the generated module, so you can put
    any Haskell code here at all.  In a grammar file, Haskell code is
    always contained between curly braces to distinguish it from the
    grammar.</para>

    <para>In this case, the parser will be a standalone program so
    we'll call the module <literal>Main</literal>.</para>

    <para>Next comes a couple of declarations:</para>

<programlisting>
%name calc
%tokentype { Token }
%error { parseError }
</programlisting>

    <indexterm>
      <primary><literal>%name</literal></primary>
    </indexterm>
    <indexterm>
      <primary><literal>%tokentype</literal></primary>
    </indexterm>
    <indexterm>
      <primary><literal>%error</literal></primary>
    </indexterm>

    <para>The first line declares the name of the parsing function
    that <application>Happy</application> will generate, in this case
    <literal>calc</literal>.  In many cases, this is the only symbol you need
    to export from the module.</para>

    <para>The second line declares the type of tokens that the parser
    will accept.  The parser (i.e. the function
    <function>calc</function>) will be of type <literal>[Token] ->
    T</literal>, where <literal>T</literal> is the return type of the
    parser, determined by the production rules below.</para>

    <para>The <literal>%error</literal> directive tells Happy the name
    of a function it should call in the event of a parse error.  More
    about this later.</para>

    <para>Now we declare all the possible tokens:</para>

<programlisting>
%token
      let             { TokenLet }
      in              { TokenIn }
      int             { TokenInt $$ }
      var             { TokenVar $$ }
      '='             { TokenEq }
      '+'             { TokenPlus }
      '-'             { TokenMinus }
      '*'             { TokenTimes }
      '/'             { TokenDiv }
      '('             { TokenOB }
      ')'             { TokenCB }
</programlisting>

    <indexterm>
      <primary><literal>%token</literal></primary>
    </indexterm>

    <para>The symbols on the left are the tokens as they will be
    referred to in the rest of the grammar, and to the right of each
    token enclosed in braces is a Haskell pattern that matches the
    token.  The parser will expect to receive a stream of tokens, each
    of which will match one of the given patterns (the definition of
    the <literal>Token</literal> datatype is given later).</para>

    <para>The <literal>&dollar;&dollar;</literal> symbol is a placeholder that
    represents the <emphasis>value</emphasis> of this token.  Normally the value
    of a token is the token itself, but by using the
    <literal>&dollar;&dollar;</literal> symbol you can specify some component
    of the token object to be the value. </para>

    <indexterm>
      <primary><literal>&dollar;&dollar;</literal></primary>
    </indexterm>

    <para>Like yacc, we include <literal>%%</literal> here, for no real
    reason.</para>

<programlisting>
%%
</programlisting>

    <para>Now we have the production rules for the grammar.</para>

<programlisting>
Exp   : let var '=' Exp in Exp  { Let $2 $4 $6 }
      | Exp1                    { Exp1 $1 }

Exp1  : Exp1 '+' Term           { Plus $1 $3 }
      | Exp1 '-' Term           { Minus $1 $3 }
      | Term                    { Term $1 }

Term  : Term '*' Factor         { Times $1 $3 }
      | Term '/' Factor         { Div $1 $3 }
      | Factor                  { Factor $1 }

Factor
      : int                     { Int $1 }
      | var                     { Var $1 }
      | '(' Exp ')'             { Brack $2 }
</programlisting>

    <indexterm>
      <primary>non-terminal</primary>
    </indexterm>
    <para>Each production consists of a <firstterm>non-terminal</firstterm>
    symbol on the left, followed by a colon, followed by one or more
    expansions on the right, separated by <literal>|</literal>.  Each expansion
    has some Haskell code associated with it, enclosed in braces as
    usual.</para>

    <para>The way to think about a parser is with each symbol having a
    <quote>value</quote>: we defined the values of the tokens above, and the
    grammar defines the values of non-terminal symbols in terms of
    sequences of other symbols (either tokens or non-terminals).  In a
    production like this:</para>

<programlisting>
n   : t_1 ... t_n   { E }
</programlisting>

    <para>whenever the parser finds the symbols <literal>t_1...t_n</literal> in
    the token stream, it constructs the symbol <literal>n</literal> and gives
    it the value <literal>E</literal>, which may refer to the values of
    <literal>t_1...t_n</literal> using the symbols
    <literal>&dollar;1...&dollar;n</literal>.</para>

    <para>The parser reduces the input using the rules in the grammar
    until just one symbol remains: the first symbol defined in the
    grammar (namely <literal>Exp</literal> in our example).  The value of this
    symbol is the return value from the parser.</para>

    <para>To complete the program, we need some extra code.  The
    grammar file may optionally contain a final code section, enclosed
    in curly braces.</para>

<programlisting>{</programlisting>

    <para>All parsers must include a function to be called in the
    event of a parse error.  In the <literal>%error</literal>
    directive earlier, we specified that the function to be called on
    a parse error is <literal>parseError</literal>:</para>

<programlisting>
parseError :: [Token] -> a
parseError _ = error "Parse error"
</programlisting>

    <para>Note that <literal>parseError</literal> must be polymorphic
    in its return type <literal>a</literal>, which usually means it
    must be a call to <literal>error</literal>.  We'll see in <xref
    linkend="sec-monads"/> how to wrap the parser in a monad so that we
    can do something more sensible with errors.  It's also possible to
    keep track of line numbers in the parser for use in error
    messages, this is described in <xref
    linkend="sec-line-numbers"/>.</para>

    <para>Next we can declare the data type that represents the parsed
    expression:</para>

<programlisting>
data Exp
      = Let String Exp Exp
      | Exp1 Exp1
      deriving Show

data Exp1
      = Plus Exp1 Term
      | Minus Exp1 Term
      | Term Term
      deriving Show

data Term
      = Times Term Factor
      | Div Term Factor
      | Factor Factor
      deriving Show

data Factor
      = Int Int
      | Var String
      | Brack Exp
      deriving Show
</programlisting>

    <para>And the data structure for the tokens...</para>

<programlisting>
data Token
      = TokenLet
      | TokenIn
      | TokenInt Int
      | TokenVar String
      | TokenEq
      | TokenPlus
      | TokenMinus
      | TokenTimes
      | TokenDiv
      | TokenOB
      | TokenCB
 deriving Show
</programlisting>

    <para>... and a simple lexer that returns this data
    structure.</para>

<programlisting>
lexer :: String -> [Token]
lexer [] = []
lexer (c:cs)
      | isSpace c = lexer cs
      | isAlpha c = lexVar (c:cs)
      | isDigit c = lexNum (c:cs)
lexer ('=':cs) = TokenEq : lexer cs
lexer ('+':cs) = TokenPlus : lexer cs
lexer ('-':cs) = TokenMinus : lexer cs
lexer ('*':cs) = TokenTimes : lexer cs
lexer ('/':cs) = TokenDiv : lexer cs
lexer ('(':cs) = TokenOB : lexer cs
lexer (')':cs) = TokenCB : lexer cs

lexNum cs = TokenInt (read num) : lexer rest
      where (num,rest) = span isDigit cs

lexVar cs =
   case span isAlpha cs of
      ("let",rest) -> TokenLet : lexer rest
      ("in",rest)  -> TokenIn : lexer rest
      (var,rest)   -> TokenVar var : lexer rest
</programlisting>

    <para>And finally a top-level function to take some input, parse
    it, and print out the result.</para>

<programlisting>
main = getContents >>= print . calc . lexer
}
</programlisting>

    <para>And that's it! A whole lexer, parser and grammar in a few
    dozen lines.  Another good example is <application>Happy</application>'s own
    parser. Several features in <application>Happy</application> were developed
    using this as an example.</para>

    <indexterm>
      <primary>info file</primary>
    </indexterm>

    <para>To generate the Haskell module for this parser, type the
    command <command>happy example.y</command> (where
    <filename>example.y</filename> is the name of the grammar file).
    The Haskell module will be placed in a file named
    <filename>example.hs</filename>.  Additionally, invoking the
    command <command>happy example.y -i</command> will produce the
    file <filename>example.info</filename> which contains detailed information
    about the parser, including states and reduction rules (see <xref
    linkend="sec-info-files"/>).  This can be invaluable for debugging
    parsers, but requires some knowledge of the operation of a
    shift-reduce parser. </para>

    <sect1 id="sec-other-datatypes">
      <title>Returning other datatypes</title>

      <para>In the above example, we used a data type to represent the
      syntax being parsed.  However, there's no reason why it has to
      be this way: you could calculate the value of the expression on
      the fly, using productions like this:</para>

<programlisting>
Term  : Term '*' Factor         { $1 * $3 }
      | Term '/' Factor         { $1 / $3 }
      | Factor                  { $1 }
</programlisting>

      <para>The value of a <literal>Term</literal> would be the value of the
      expression itself, and the parser could return an integer.  </para>

      <para>This works for simple expression types, but our grammar
      includes variables and the <literal>let</literal> syntax.  How do we know
      the value of a variable while we're parsing it?  We don't, but
      since the Haskell code for a production can be anything at all,
      we could make it a function that takes an environment of
      variable values, and returns the computed value of the
      expression:</para>

<programlisting>
Exp   : let var '=' Exp in Exp  { \p -> $6 (($2,$4 p):p) }
      | Exp1                    { $1 }

Exp1  : Exp1 '+' Term           { \p -> $1 p + $3 p }
      | Exp1 '-' Term           { \p -> $1 p - $3 p }
      | Term                    { $1 }

Term  : Term '*' Factor         { \p -> $1 p * $3 p }
      | Term '/' Factor         { \p -> $1 p `div` $3 p }
      | Factor                  { $1 }

Factor
      : int                     { \p -> $1 }
      | var                     { \p -> case lookup $1 p of
	                                    Nothing -> error "no var"
					    Just i  -> i }
      | '(' Exp ')'             { $2 }
</programlisting>

      <para>The value of each production is a function from an
      environment <emphasis>p</emphasis> to a value.  When parsing a
      <literal>let</literal> construct, we extend the environment with the new
      binding to find the value of the body, and the rule for
      <literal>var</literal> looks up its value in the environment.  There's
      something you can't do in <literal>yacc</literal> :-)</para>

    </sect1>

    <sect1 id="sec-sequences">
      <title>Parsing sequences</title>

      <para>A common feature in grammars is a <emphasis>sequence</emphasis> of a
      particular syntactic element.  In EBNF, we'd write something
      like <literal>n+</literal> to represent a sequence of one or more
      <literal>n</literal>s, and <literal>n*</literal> for zero or more.
      <application>Happy</application> doesn't support this syntax explicitly, but
      you can define the equivalent sequences using simple
      productions.</para>

      <para>For example, the grammar for <application>Happy</application> itself
      contains a rule like this:</para>

<programlisting>
prods : prod                   { [$1] }
      | prods prod             { $2 : $1 }
</programlisting>

      <para>In other words, a sequence of productions is either a
      single production, or a sequence of productions followed by a
      single production.  This recursive rule defines a sequence of
      one or more productions.</para>

      <para>One thing to note about this rule is that we used
      <emphasis>left recursion</emphasis> to define it - we could have written
      it like this:</para>

      <indexterm>
	<primary>recursion, left vs. right</primary>
      </indexterm>

<programlisting>
prods : prod                  { [$1] }
      | prod prods            { $1 : $2 }
</programlisting>

      <para>The only reason we used left recursion is that
      <application>Happy</application> is more efficient at parsing left-recursive
      rules; they result in a constant stack-space parser, whereas
      right-recursive rules require stack space proportional to the
      length of the list being parsed.  This can be extremely
      important where long sequences are involved, for instance in
      automatically generated output.  For example, the parser in GHC
      used to use right-recursion to parse lists, and as a result it
      failed to parse some <application>Happy</application>-generated modules due
      to running out of stack space!</para>

      <para>One implication of using left recursion is that the resulting
      list comes out reversed, and you have to reverse it again to get
      it in the original order.  Take a look at the
      <application>Happy</application> grammar for Haskell for many examples of
      this.</para>

      <para>Parsing sequences of zero or more elements requires a
      trivial change to the above pattern:</para>

<programlisting>
prods : {- empty -}           { [] }
      | prods prod            { $2 : $1 }
</programlisting>

      <para>Yes - empty productions are allowed.  The normal
      convention is to include the comment <literal>{- empty -}</literal> to
      make it more obvious to a reader of the code what's going
      on.</para>

      <sect2 id="sec-separators">
	<title>Sequences with separators</title>

	<para>A common type of sequence is one with a
        <emphasis>separator</emphasis>: for instance function bodies in C
        consist of statements separated by semicolons.  To parse this
        kind of sequence we use a production like this:</para>

<programlisting>
stmts : stmt                   { [$1] }
      | stmts ';' stmt         { $3 : $1 }
</programlisting>

	<para>If the <literal>;</literal> is to be a <emphasis>terminator</emphasis>
        rather than a separator (i.e. there should be one following
        each statement), we can remove the semicolon from the above
        rule and redefine <literal>stmt</literal> as</para>

<programlisting>
stmt : stmt1 ';'              { $1 }
</programlisting>

	<para>where <literal>stmt1</literal> is the real definition of statements.</para>

        <para>We might like to allow extra semicolons between
        statements, to be a bit more liberal in what we allow as legal
        syntax.  We probably just want the parser to ignore these
        extra semicolons, and not generate a ``null statement'' value
        or something.  The following rule parses a sequence of zero or
        more statements separated by semicolons, in which the
        statements may be empty:</para>

<programlisting>
stmts : stmts ';' stmt          { $3 : $1 }
      | stmts ';'               { $1 }
      | stmt			{ [$1] }
      | {- empty -}		{ [] }
</programlisting>

	<para>Parsing sequences of <emphasis>one</emphasis> or more possibly
	null statements is left as an exercise for the reader...</para>

    </sect2>
    </sect1>

<!--
    <sect1 id="sec-ambiguities">
      <title>Ambiguities</title>

      <para>(section under construction)</para>

    </sect1>
-->

    <sect1 id="sec-Precedences">
      <title>Using Precedences</title>
      <indexterm><primary>precedences</primary></indexterm>
      <indexterm><primary>associativity</primary></indexterm>

      <para>Going back to our earlier expression-parsing example,
      wouldn't it be nicer if we didn't have to explicitly separate
      the expressions into terms and factors, merely to make it
      clear that <literal>'*'</literal> and <literal>'/'</literal>
      operators bind more tightly than <literal>'+'</literal> and
      <literal>'-'</literal>?</para>

      <para>We could just change the grammar as follows (making the
      appropriate changes to the expression datatype too):</para>

<programlisting>
Exp   : let var '=' Exp in Exp  { Let $2 $4 $6 }
      | Exp '+' Exp             { Plus $1 $3 }
      | Exp '-' Exp             { Minus $1 $3 }
      | Exp '*' Exp             { Times $1 $3 }
      | Exp '/' Exp             { Div $1 $3 }
      | '(' Exp ')'             { Brack $2 }
      | int                     { Int $1 }
      | var                     { Var $1 }
</programlisting>

      <para>but now Happy will complain that there are shift/reduce
      conflicts because the grammar is ambiguous - we haven't
      specified whether e.g. <literal>1 + 2 * 3</literal> is to be
      parsed as <literal>1 + (2 * 3)</literal> or <literal>(1 + 2) *
      3</literal>.  Happy allows these ambiguities to be resolved by
      specifying the <firstterm>precedences</firstterm> of the
      operators involved using directives in the
      header<footnote><para>Users of <literal>yacc</literal> will find
      this familiar: Happy's precedence scheme works in exactly the
      same way.</para></footnote>:</para>

<programlisting>
...
%right in
%left '+' '-'
%left '*' '/'
%%
...
</programlisting>
<indexterm><primary><literal>%left</literal> directive</primary></indexterm>
<indexterm><primary><literal>%right</literal> directive</primary></indexterm>
<indexterm><primary><literal>%nonassoc</literal> directive</primary></indexterm>

      <para>The <literal>%left</literal> or <literal>%right</literal>
      directive is followed by a list of terminals, and declares all
      these tokens to be left or right-associative respectively.  The
      precedence of these tokens with respect to other tokens is
      established by the order of the <literal>%left</literal> and
      <literal>%right</literal> directives: earlier means lower
      precedence.  A higher precedence causes an operator to bind more
      tightly; in our example above, because <literal>'*'</literal>
      has a higher precedence than <literal>'+'</literal>, the
      expression <literal>1 + 2 * 3</literal> will parse as <literal>1
      + (2 * 3)</literal>.</para>

      <para>What happens when two operators have the same precedence?
      This is when the <firstterm>associativity</firstterm> comes into
      play.  Operators specified as left associative will cause
      expressions like <literal>1 + 2 - 3</literal> to parse as
      <literal>(1 + 2) - 3</literal>, whereas right-associative
      operators would parse as <literal>1 + (2 - 3)</literal>.  There
      is also a <literal>%nonassoc</literal> directive which indicates
      that the specified operators may not be used together.  For
      example, if we add the comparison operators
      <literal>'>'</literal> and <literal>'&lt;'</literal> to our
      grammar, then we would probably give their precedence as:</para>

<programlisting>...
%right in
%nonassoc '>' '&lt;'
%left '+' '-'
%left '*' '/'
%%
...</programlisting>

      <para>which indicates that <literal>'>'</literal> and
      <literal>'&lt;'</literal> bind less tightly than the other
      operators, and the non-associativity causes expressions such as
      <literal>1 > 2 > 3</literal> to be disallowed.</para>

      <sect2 id="how-precedence-works">
	<title>How precedence works</title>

	<para>The precedence directives, <literal>%left</literal>,
	<literal>%right</literal> and <literal>%nonassoc</literal>,
	assign precedence levels to the tokens in the declaration.  A
	rule in the grammar may also have a precedence: if the last
	terminal in the right hand side of the rule has a precedence,
	then this is the precedence of the whole rule.</para>

	<para>The precedences are used to resolve ambiguities in the
	grammar.  If there is a shift/reduce conflict, then the
	precedence of the rule and the lookahead token are examined in
	order to resolve the conflict:</para>

	<itemizedlist>
	  <listitem>
	    <para>If the precedence of the rule is higher, then the
	    conflict is resolved as a reduce.</para>
	  </listitem>
	  <listitem>
	    <para>If the precedence of the lookahead token is higher,
	    then the conflict is resolved as a shift.</para>
	  </listitem>
	  <listitem>
	    <para>If the precedences are equal, then</para>
	    <itemizedlist>
		<listitem>
		<para>If the token is left-associative, then reduce</para>
	      </listitem>
	      <listitem>
		<para>If the token is right-associative, then shift</para>
	      </listitem>
	      <listitem>
		<para>If the token is non-associative, then fail</para>
	      </listitem>
	    </itemizedlist>
	  </listitem>
	  <listitem>
	    <para>If either the rule or the token has no precedence,
	    then the default is to shift (these conflicts are reported
	    by Happy, whereas ones that are automatically resolved by
	    the precedence rules are not).</para>
	  </listitem>
	</itemizedlist>
      </sect2>

      <sect2 id="context-precedence">
	<title>Context-dependent Precedence</title>

	<para>The precedence of an individual rule can be overridden,
	using <firstterm>context precedence</firstterm>.  This is
	useful when, for example, a particular token has a different
	precedence depending on the context.  A common example is the
	minus sign: it has high precedence when used as prefix
	negation, but a lower precedence when used as binary
	subtraction.</para>

	<para>We can implement this in Happy as follows:</para>

<programlisting>%right in
%nonassoc '>' '&lt;'
%left '+' '-'
%left '*' '/'
%left NEG
%%

Exp   : let var '=' Exp in Exp  { Let $2 $4 $6 }
      | Exp '+' Exp             { Plus $1 $3 }
      | Exp '-' Exp             { Minus $1 $3 }
      | Exp '*' Exp             { Times $1 $3 }
      | Exp '/' Exp             { Div $1 $3 }
      | '(' Exp ')'             { Brack $2 }
      | '-' Exp %prec NEG       { Negate $2 }
      | int                     { Int $1 }
      | var                     { Var $1 }</programlisting>
<indexterm><primary><literal>%prec</literal> directive</primary></indexterm>

	<para>We invent a new token <literal>NEG</literal> as a
	placeholder for the precedence of our prefix negation rule.
	The <literal>NEG</literal> token doesn't need to appear in
	a <literal>%token</literal> directive.  The prefix negation
	rule has a <literal>%prec NEG</literal> directive attached,
	which overrides the default precedence for the rule (which
	would normally be the precedence of '-') with the precedence
	of <literal>NEG</literal>.</para>
      </sect2>
    </sect1>

    <sect1 id="sec-type-signatures">
      <title>Type Signatures</title>

      <indexterm>
	<primary>type</primary>
	<secondary>signatures in grammar</secondary>
      </indexterm>

      <para><application>Happy</application> allows you to include type signatures
      in the grammar file itself, to indicate the type of each
      production.  This has several benefits:</para>

      <itemizedlist>
	<listitem>
	  <para> Documentation: including types in the grammar helps
          to document the grammar for someone else (and indeed
          yourself) reading the code.</para>
	</listitem>

	<listitem>
	  <para> Fixing type errors in the generated module can become
          slightly easier if <application>Happy</application> has inserted type
          signatures for you.  This is a slightly dubious benefit,
          since type errors in the generated module are still somewhat
          difficult to find.  </para>
	</listitem>

	<listitem>
	  <para> Type signatures generally help the Haskell compiler
          to compile the parser faster.  This is important when really
          large grammar files are being used.</para>
	</listitem>
      </itemizedlist>

      <para>The syntax for type signatures in the grammar file is as
      follows:</para>

<programlisting>
stmts   :: { [ Stmt ] }
stmts   : stmts stmt                { $2 : $1 }
	| stmt                      { [$1] }
</programlisting>

      <para>In fact, you can leave out the superfluous occurrence of
      <literal>stmts</literal>:</para>

<programlisting>
stmts   :: { [ Stmt ] }
	: stmts stmt                { $2 : $1 }
	| stmt                      { [$1] }
</programlisting>

      <para>Note that currently, you have to include type signatures
      for <emphasis>all</emphasis> the productions in the grammar to benefit
      from the second and third points above.  This is due to boring
      technical reasons, but it is hoped that this restriction can be
      removed in the future.</para>

      <para>It is possible to have productions with polymorphic or overloaded
	types.  However, because the type of each production becomes the
	argument type of a constructor in an algebraic datatype in the
	generated source file, compiling the generated file requires a compiler
	that supports local universal quantification.  GHC (with the
	<option>-fglasgow-exts</option> option) and Hugs are known to support
	this.</para>
    </sect1>

    <sect1 id="sec-monads">
      <title>Monadic Parsers</title>

      <indexterm>
	<primary>monadic</primary>
	<secondary>parsers</secondary>
      </indexterm>

      <para><application>Happy</application> has support for threading a monad
      through the generated parser.  This might be useful for several
      reasons:</para>

      <itemizedlist>

	<listitem>
          <para> Handling parse errors
	  <indexterm>
	    <primary>parse errors</primary>
	    <secondary>handling</secondary>
	  </indexterm>
<!--	  <indexterm>
	    <primary>error</primary>
	    <secondary>parse</secondary>
	    <see>parse errors</see>
	  </indexterm>
-->
	  by using an exception monad
          (see <xref linkend="sec-exception"/>).</para>
	</listitem>

	<listitem>
          <para> Keeping track of line numbers
	  <indexterm>
	    <primary>line numbers</primary>
	  </indexterm>
	  in the input file, for
          example for use in error messages (see <xref
          linkend="sec-line-numbers"/>).</para>
	</listitem>

	<listitem>
	  <para> Performing IO operations during parsing.</para>
	</listitem>

	<listitem>
	  <para> Parsing languages with context-dependencies (such as
          C) require some state in the parser.</para>
	</listitem>

</itemizedlist>

      <para>Adding monadic support to your parser couldn't be simpler.
      Just add the following directive to the declaration section of
      the grammar file:</para>

<programlisting>
%monad { &lt;type&gt; } [ { &lt;then&gt; } { &lt;return&gt; } ]
</programlisting>

      <indexterm>
	<primary><literal>%monad</literal></primary>
      </indexterm>

      <para>where <literal>&lt;type&gt;</literal> is the type constructor for
      the monad, <literal>&lt;then&gt;</literal> is the bind operation of the
      monad, and <literal>&lt;return&gt;</literal> is the return operation. If
      you leave out the names for the bind and return operations,
      <application>Happy</application> assumes that <literal>&lt;type&gt;</literal> is an
      instance of the standard Haskell type class <literal>Monad</literal> and
      uses the overloaded names for the bind and return
      operations.</para>

      <para>When this declaration is included in the grammar,
      <application>Happy</application> makes a couple of changes to the generated
      parser: the types of the main parser function and
      <literal>parseError</literal> (the function named in
      <literal>%error</literal>) become <literal>[Token] -&gt; P a</literal> where
      <literal>P</literal> is the monad type constructor, and the function must
      be polymorphic in <literal>a</literal>.  In other words,
      <application>Happy</application> adds an application of the
      <literal>&lt;return&gt;</literal> operation defined in the declaration
      above, around the result of the parser (<literal>parseError</literal> is
      affected because it must have the same return type as the
      parser).  And that's all it does.</para>

      <para>This still isn't very useful: all you can do is return
      something of monadic type from <literal>parseError</literal>.  How do you
      specify that the productions can also have type <literal>P a</literal>?
      Most of the time, you don't want a production to have this type:
      you'd have to write explicit <literal>returnP</literal>s everywhere.
      However, there may be a few rules in a grammar that need to get
      at the monad, so <application>Happy</application> has a special syntax for
      monadic actions:</para>

<programlisting>
n  :  t_1 ... t_n          {% &lt;expr&gt; }
</programlisting>

      <indexterm>
	<primary>monadic</primary>
	<secondary>actions</secondary>
      </indexterm>
      <para>The <literal>%</literal> in the action indicates that this is a
      monadic action, with type <literal>P a</literal>, where <literal>a</literal> is
      the real return type of the production.  When
      <application>Happy</application> reduces one of these rules, it evaluates the
      expression </para>

<programlisting>
&lt;expr&gt; `then` \result -> &lt;continue parsing&gt;
</programlisting>

      <para><application>Happy</application> uses <literal>result</literal> as the real
      semantic value of the production.  During parsing, several
      monadic actions might be reduced, resulting in a sequence
      like</para>

<programlisting>
&lt;expr1&gt; `then` \r1 ->
&lt;expr2&gt; `then` \r2 ->
...
return &lt;expr3&gt;
</programlisting>

      <para>The monadic actions are performed in the order that they
      are <emphasis>reduced</emphasis>.  If we consider the parse as a tree,
      then reductions happen in a depth-first left-to-right manner.
      The great thing about adding a monad to your parser is that it
      doesn't impose any performance overhead for normal reductions -
      only the monadic ones are translated like this.</para>

      <para>Take a look at the Haskell parser for a good illustration
      of how to use a monad in your parser: it contains examples of
      all the principles discussed in this section, namely parse
      errors, a threaded lexer, line/column numbers, and state
      communication between the parser and lexer.</para>

      <para>The following sections consider a couple of uses for
      monadic parsers, and describe how to also thread the monad
      through the lexical analyser.</para>

      <sect2 id="sec-exception">
	<title>Handling Parse Errors</title>
	<indexterm>
	  <primary>parse errors</primary>
	  <secondary>handling</secondary>
	</indexterm>

      <para>It's not very convenient to just call <literal>error</literal> when
      a parse error is detected: in a robust setting, you'd like the
      program to recover gracefully and report a useful error message
      to the user.  Exceptions (of which errors are a special case)
      are normally implemented in Haskell by using an exception monad,
      something like:</para>

<programlisting>
data E a = Ok a | Failed String

thenE :: E a -> (a -> E b) -> E b
m `thenE` k =
   case m of
      Ok a     -> k a
      Failed e -> Failed e

returnE :: a -> E a
returnE a = Ok a

failE :: String -> E a
failE err = Failed err

catchE :: E a -> (String -> E a) -> E a
catchE m k =
   case m of
      Ok a     -> Ok a
      Failed e -> k e
</programlisting>

	<para>This monad just uses a string as the error type.  The
        functions <literal>thenE</literal> and <literal>returnE</literal> are the usual
        bind and return operations of the monad, <literal>failE</literal>
        raises an error, and <literal>catchE</literal> is a combinator for
        handling exceptions.</para>

	<para>We can add this monad to the parser with the declaration</para>

<programlisting>
%monad { E } { thenE } { returnE }
</programlisting>

	<para>Now, without changing the grammar, we can change the
        definition of <literal>parseError</literal> and have something sensible
        happen for a parse error:</para>

<programlisting>
parseError tokens = failE "Parse error"
</programlisting>

	<para>The parser now raises an exception in the monad instead
	of bombing out on a parse error.</para>

	<para>We can also generate errors during parsing.  There are
        times when it is more convenient to parse a more general
        language than that which is actually intended, and check it
        later.  An example comes from Haskell, where the precedence
        values in infix declarations must be between 0 and 9:</para>

<programlisting>prec :: { Int }
      : int    {% if $1 &lt; 0 || $1 > 9
	                then failE "Precedence out of range"
		        else returnE $1
		}</programlisting>

	<para>The monadic action allows the check to be placed in the
	parser itself, where it belongs.</para>

    </sect2>

    <sect2 id="sec-lexers">
      <title>Threaded Lexers</title>
	<indexterm>
	  <primary>lexer, threaded</primary>
	</indexterm>
	<indexterm>
	  <primary>monadic</primary>
	  <secondary>lexer</secondary>
	</indexterm>

	<para><application>Happy</application> allows the monad concept to be
	extended to the lexical analyser, too.  This has several
	useful consequences:</para>

	<itemizedlist>
	  <listitem>
	    <para> Lexical errors can be treated in the same way as
            parse errors, using an exception monad.</para>
	    <indexterm>
	      <primary>parse errors</primary>
	      <secondary>lexical</secondary>
	    </indexterm>
	  </listitem>
	  <listitem>
	    <para> Information such as the current file and line
            number can be communicated between the lexer and
            parser. </para>
	  </listitem>
	  <listitem>
	    <para> General state communication between the parser and
            lexer - for example, implementation of the Haskell layout
            rule requires this kind of interaction.
            </para>
	  </listitem>
	  <listitem>
	    <para> IO operations can be performed in the lexer - this
            could be useful for following import/include declarations
            for instance.</para>
	  </listitem>
	</itemizedlist>

	<para>A monadic lexer is requested by adding the following
	declaration to the grammar file:</para>

<programlisting>
%lexer { &lt;lexer&gt; } { &lt;eof&gt; }
</programlisting>

	<indexterm>
	  <primary><literal>%lexer</literal></primary>
	</indexterm>

	<para>where <literal>&lt;lexer&gt;</literal> is the name of the lexical
        analyser function, and <literal>&lt;eof&gt;</literal> is a token that
        is to be treated as the end of file.</para>

	<para>When using a monadic lexer, the parser no longer reads a
        list of tokens.  Instead, it calls the lexical analysis
        function for each new token to be read.  This has the side
        effect of eliminating the intermediate list of tokens, which
        is a slight performance win.</para>

	<para>The type of the main parser function is now just
        <literal>P a</literal> - the input is being handled completely
        within the monad.</para>

	<para>The type of <literal>parseError</literal> becomes
	<literal>Token -&gt; P a</literal>; that is it takes Happy's
	current lookahead token as input.  This can be useful, because
	the error function probably wants to report the token at which
	the parse error occurred, and otherwise the lexer would have
	to store this token in the monad.</para>

	<para>The lexical analysis function must have the following
	type:</para>

<programlisting>
lexer :: (Token -> P a) -> P a
</programlisting>

	<para>where <literal>P</literal> is the monad type constructor declared
        with <literal>%monad</literal>, and <literal>a</literal> can be replaced by the
        parser return type if desired.</para>

	<para>You can see from this type that the lexer takes a
        <emphasis>continuation</emphasis> as an argument.  The lexer is to find
        the next token, and pass it to this continuation to carry on
        with the parse.  Obviously, we need to keep track of the input
        in the monad somehow, so that the lexer can do something
        different each time it's called!</para>

	<para>Let's take the exception monad above, and extend it to
        add the input string so that we can use it with a threaded
        lexer.</para>

<programlisting>
data ParseResult a = Ok a | Failed String
type P a = String -> ParseResult a

thenP :: P a -> (a -> P b) -> P b
m `thenP` k = \s ->
   case m s of
       Ok a -> k a s
       Failed e -> Failed e

returnP :: a -> P a
returnP a = \s -> Ok a

failP :: String -> P a
failP err = \s -> Failed err

catchP :: P a -> (String -> P a) -> P a
catchP m k = \s ->
   case m s of
      Ok a -> Ok a
      Failed e -> k e s
</programlisting>

	<para>Notice that this isn't a real state monad - the input
        string just gets passed around, not returned.  Our lexer will
        now look something like this:</para>

<programlisting>
lexer :: (Token -> P a) -> P a
lexer cont s =
    ... lexical analysis code ...
    cont token s'
</programlisting>

	<para>the lexer grabs the continuation and the input string,
        finds the next token <literal>token</literal>, and passes it together
        with the remaining input string <literal>s'</literal> to the
        continuation.</para>

	<para>We can now indicate lexical errors by ignoring the
        continuation and calling <literal>failP "error message" s</literal>
        within the lexer (don't forget to pass the input string to
        make the types work out).</para>

	<para>This may all seem a bit weird.  Why, you ask, doesn't
        the lexer just have type <literal>P Token</literal>?  It was
        done this way for performance reasons - this formulation
        sometimes means that you can use a reader monad instead of a
        state monad for <literal>P</literal>, and the reader monad
        might be faster.  It's not at all clear that this reasoning
        still holds (or indeed ever held), and it's entirely possible
        that the use of a continuation here is just a
        misfeature.</para>

        <para>If you want a lexer of type <literal>P Token</literal>,
        then just define a wrapper to deal with the
        continuation:</para>

<programlisting>
lexwrap :: (Token -> P a) -> P a
lexwrap cont = real_lexer `thenP` \token -> cont token
</programlisting>

      <sect3>
	<title>Monadic productions with %lexer</title>

	<para>The <literal>{% ... }</literal> actions work fine with
	<literal>%lexer</literal>, but additionally there are two more
	forms which are useful in certain cases.  Firstly:</para>

<programlisting>
n  :  t_1 ... t_n          {%^ &lt;expr&gt; }
</programlisting>

	<para>In this case, <literal>&lt;expr&gt;</literal> has type
	<literal>Token -> P a</literal>.  That is, Happy passes the
	current lookahead token to the monadic action
	<literal>&lt;expr&gt;</literal>.  This is a useful way to get
	hold of Happy's current lookahead token without having to
	store it in the monad.</para>

<programlisting>
n  :  t_1 ... t_n          {%% &lt;expr&gt; }
</programlisting>

	<para>This is a slight variant on the previous form.  The type
	of <literal>&lt;expr&gt;</literal> is the same, but in this
	case the lookahead token is actually discarded and a new token
	is read from the input.  This can be useful when you want to
	change the next token and continue parsing.</para>
      </sect3>
    </sect2>

    <sect2 id="sec-line-numbers">
      <title>Line Numbers</title>

	<indexterm>
	  <primary>line numbers</primary>
	</indexterm>

	<indexterm>
	  <primary><literal>%newline</literal></primary>
	</indexterm>
	<para>Previous versions of <application>Happy</application> had a
        <literal>%newline</literal> directive that enabled simple line numbers
        to be counted by the parser and referenced in the actions.  We
        warned you that this facility may go away and be replaced by
        something more general, well guess what? :-)</para>

	<para>Line numbers can now be dealt with quite
        straightforwardly using a monadic parser/lexer combination.
        Ok, we have to extend the monad a bit more:</para>

<programlisting>
type LineNumber = Int
type P a = String -> LineNumber -> ParseResult a

getLineNo :: P LineNumber
getLineNo = \s l -> Ok l
</programlisting>

	<para>(the rest of the functions in the monad follow by just
        adding the extra line number argument in the same way as the
        input string).  Again, the line number is just passed down,
        not returned: this is OK because of the continuation-based
        lexer that can change the line number and pass the new one to
        the continuation.</para>

	<para>The lexer can now update the line number as follows:</para>

<programlisting>
lexer cont s =
  case s of
     '\n':s  ->  \line -> lexer cont s (line + 1)
     ... rest of lexical analysis ...
</programlisting>

	<para>It's as simple as that.  Take a look at
        <application>Happy</application>'s own parser if you have the sources lying
        around, it uses a monad just like the one above.</para>

        <para>Reporting the line number of a parse error is achieved
        by changing <literal>parseError</literal> to look something like
        this:</para>

<programlisting>
parseError :: Token -> P a
parseError = getLineNo `thenP` \line ->
             failP (show line ++ ": parse error")
</programlisting>

	<para>We can also get hold of the line number during parsing,
        to put it in the parsed data structure for future reference.
        A good way to do this is to have a production in the grammar
        that returns the current line number: </para>

<programlisting>lineno :: { LineNumber }
        : {- empty -}      {% getLineNo }</programlisting>

	<para>The semantic value of <literal>lineno</literal> is the line
        number of the last token read - this will always be the token
        directly following the <literal>lineno</literal> symbol in the grammar,
        since <application>Happy</application> always keeps one lookahead token in
        reserve.</para>

      </sect2>

      <sect2 id="sec-monad-summary">
	<title>Summary</title>

	<para>The types of various functions related to the parser are
        dependent on what combination of <literal>%monad</literal> and
        <literal>%lexer</literal> directives are present in the grammar.  For
        reference, we list those types here.  In the following types,
        <emphasis>t</emphasis> is the return type of the
        parser.  A type containing a type variable indicates that the
        specified function must be polymorphic.</para>

	<indexterm>
	  <primary>type</primary>
	  <secondary>of <function>parseError</function></secondary>
	</indexterm>
	<indexterm>
	  <primary>type</primary>
	  <secondary>of parser</secondary>
	</indexterm>
	<indexterm>
	  <primary>type</primary>
	  <secondary>of lexer</secondary>
	</indexterm>

	<itemizedlist>
	  <listitem>
	    <formalpara>
	      <title> No <literal>&percnt;monad</literal> or
	      <literal>&percnt;lexer</literal> </title>
	      <para>
<programlisting>
parse      :: [Token] -> <emphasis>t</emphasis>
parseError :: [Token] -> a
</programlisting>
</para>
	    </formalpara>
	  </listitem>

	  <listitem>
	    <formalpara>
	      <title> with <literal>%monad</literal> </title>
	      <para>
<programlisting>
parse      :: [Token] -> P <emphasis>t</emphasis>
parseError :: [Token] -> P a
</programlisting>
</para>
	    </formalpara>
	  </listitem>


	  <listitem>
	    <formalpara>
	      <title> with <literal>%lexer</literal> </title>
	      <para><programlisting>
parse      :: T <emphasis>t</emphasis>
parseError :: Token -> T a
lexer      :: (Token -> T a) -> T a
</programlisting>
where the type constructor <literal>T</literal> is whatever you want (usually <literal>T
a = String -> a</literal>).  I'm not sure if this is useful, or even if it works
properly.
	    </formalpara>
	  </listitem>

	  <listitem>
	    <formalpara>
	      <title> with <literal>%monad</literal> and <literal>%lexer</literal> </title>
	      <para><programlisting>
parse      :: P <emphasis>t</emphasis>
parseError :: Token -> P a
lexer      :: (Token -> P a) -> P a
</programlisting>
</para>
	    </formalpara>
	  </listitem>
	</itemizedlist>

      </sect2>
    </sect1>

    <sect1 id="sec-error">
      <title>The Error Token</title>
      <indexterm>
	<primary>error token</primary>
      </indexterm>

      <para><application>Happy</application> supports a limited form of error
      recovery, using the special symbol <literal>error</literal> in a grammar
      file.  When <application>Happy</application> finds a parse error during
      parsing, it automatically inserts the <literal>error</literal> symbol; if
      your grammar deals with <literal>error</literal> explicitly, then it can
      detect the error and carry on.</para>

      <para>For example, the <application>Happy</application> grammar for Haskell
      uses error recovery to implement Haskell layout.  The grammar
      has a rule that looks like this:</para>

<programlisting>
close : '}'                  { () }
      | error		     { () }
</programlisting>

      <para>This says that a close brace in a layout-indented context
      may be either a curly brace (inserted by the lexical analyser),
      or a parse error.  </para>

      <para>This rule is used to parse expressions like <literal>let x
      = e in e'</literal>: the layout system inserts an open brace before
      <literal>x</literal>, and the occurrence of the <literal>in</literal> symbol
      generates a parse error, which is interpreted as a close brace
      by the above rule.</para>

      <indexterm>
	<primary><application>yacc</application></primary>
      </indexterm>
      <para>Note for <literal>yacc</literal> users: this form of error recovery
      is strictly more limited than that provided by <literal>yacc</literal>.
      During a parse error condition, <literal>yacc</literal> attempts to
      discard states and tokens in order to get back into a state
      where parsing may continue; <application>Happy</application> doesn't do this.
      The reason is that normal <literal>yacc</literal> error recovery is
      notoriously hard to describe, and the semantics depend heavily
      on the workings of a shift-reduce parser.  Furthermore,
      different implementations of <literal>yacc</literal> appear to implement
      error recovery differently.  <application>Happy</application>'s limited error
      recovery on the other hand is well-defined, and is just
      sufficient to implement the Haskell layout rule (which is why it
      was added in the first place).</para>
    </sect1>

    <sect1 id="sec-multiple-parsers">
      <title>Generating Multiple Parsers From a Single Grammar</title>
      <indexterm>
	<primary>multiple parsers</primary>
      </indexterm>

      <para>It is often useful to use a single grammar to describe
      multiple parsers, where each parser has a different top-level
      non-terminal, but parts of the grammar are shared between
      parsers.  A classic example of this is an interpreter, which
      needs to be able to parse both entire files and single
      expressions: the expression grammar is likely to be identical
      for the two parsers, so we would like to use a single grammar
      but have two entry points.</para>

      <para><application>Happy</application> lets you do this by
      allowing multiple <literal>%name</literal> directives in the
      grammar file.  The <literal>%name</literal> directive takes an
      optional second parameter specifying the top-level
      non-terminal for this parser, so we may specify multiple parsers
      like so:</para>
      <indexterm><primary><literal>%name</literal> directive</primary>
      </indexterm>

<programlisting>
%name parse1 non-terminal1
%name parse2 non-terminal2
</programlisting>

      <para><application>Happy</application> will generate from this a
      module which defines two functions <function>parse1</function>
      and <function>parse2</function>, which parse the grammars given
      by <literal>non-terminal1</literal> and
      <literal>non-terminal2</literal> respectively.  Each parsing
      function will of course have a different type, depending on the
      type of the appropriate non-terminal.</para>
    </sect1>

  </chapter>

  <chapter id="sec-glr">

    <chapterinfo>
      <copyright>
        <year>2004</year>
        <holder>University of Durham, Paul Callaghan, Ben Medlock</holder>
      </copyright>
    </chapterinfo>

    <title>Generalized LR Parsing</title>

    <para>This chapter explains how to use the GLR parsing extension,
    which allows <application>Happy</application> to parse ambiguous
    grammars and produce useful results.
    This extension is triggered with the <option>--glr</option> flag,
    which causes <application>Happy</application>
    to use a different driver for the LALR(1) parsing
    tables. The result of parsing is a structure which encodes compactly
    <emphasis>all</emphasis> of the possible parses.
    There are two options for how semantic information is combined with
    the structural information.
    </para>

    <para>
    This extension was developed by Paul Callaghan and Ben Medlock
    (University of Durham). It is based on the structural parser
    implemented in Medlock's undergraduate project, but significantly
    extended and improved by Callaghan.
    Bug reports, comments, questions etc should be sent to
    <email>P.C.Callaghan@durham.ac.uk</email>.
    Further information can be found on Callaghan's
    <ulink url="http://www.dur.ac.uk/p.c.callaghan/happy-glr">GLR parser
    page</ulink>.


    </para>

    <sect1 id="sec-glr-intro">
      <title>Introduction</title>

      <para>
      Here's an ambiguous grammar. It has no information about the
      associativity of <literal>+</literal>, so for example,
      <literal>1+2+3</literal> can be parsed as
      <literal>(1+(2+3))</literal> or <literal>((1+2)+3)</literal>.
      In conventional mode, <application>Happy</application>
      would complain about a shift/reduce
      conflict, although it would generate a parser which always shifts
      in such a conflict, and hence would produce <emphasis>only</emphasis>
      the first alternative above.
      </para>

<programlisting>
E -> E + E
E -> i       -- any integer
</programlisting>

      <para>
      GLR parsing will accept this grammar without complaint, and produce
      a result which encodes <emphasis>both</emphasis> alternatives
      simultaneously. Now consider the more interesting example of
      <literal>1+2+3+4</literal>, which has five distinct parses -- try to
      list them! You will see that some of the subtrees are identical.
      A further property of the GLR output is that such sub-results are
      shared, hence efficiently represented: there is no combinatorial
      explosion.
      Below is the simplified output of the GLR parser for this example.
      </para>

<programlisting>
Root (0,7,G_E)
(0,1,G_E)     => [[(0,1,Tok '1')]]
(0,3,G_E)     => [[(0,1,G_E),(1,2,Tok '+'),(2,3,G_E)]]
(0,5,G_E)     => [[(0,1,G_E),(1,2,Tok '+'),(2,5,G_E)]
                  ,[(0,3,G_E),(3,4,Tok '+'),(4,5,G_E)]]
(0,7,G_E)     => [[(0,3,G_E),(3,4,Tok '+'),(4,7,G_E)]
                  ,[(0,1,G_E),(1,2,Tok '+'),(2,7,G_E)]
                  ,[(0,5,G_E),(5,6,Tok '+'),(6,7,G_E)]]
(2,3,G_E)     => [[(2,3,Tok '2')]]
(2,5,G_E)     => [[(2,3,G_E),(3,4,Tok '+'),(4,5,G_E)]]
(2,7,G_E)     => [[(2,3,G_E),(3,4,Tok '+'),(4,7,G_E)]
                  ,[(2,5,G_E),(5,6,Tok '+'),(6,7,G_E)]]
(4,5,G_E)     => [[(4,5,Tok '3')]]
(4,7,G_E)     => [[(4,5,G_E),(5,6,Tok '+'),(6,7,G_E)]]
(6,7,G_E)     => [[(6,7,Tok '4')]]
</programlisting>

      <para>
      This is a directed, acyclic and-or graph.
      The node "names" are of form <literal>(a,b,c)</literal>
      where <literal>a</literal> and <literal>b</literal>
      are the start and end points (as positions in the input string)
      and <literal>c</literal> is a category (or name of grammar rule).
      For example <literal>(2,7,G_E)</literal> spans positions 2 to 7
      and contains analyses which match the <literal>E</literal>
      grammar rule.
      Such analyses are given as a list of alternatives (disjunctions),
      each corresponding to some use of a production of that
      category, which in turn are a conjunction of sub-analyses,
      each represented as a node in the graph or an instance of a token.
      </para>

      <para>
      Hence <literal>(2,7,G_E)</literal> contains two alternatives,
      one which has <literal>(2,3,G_E)</literal> as its first child
      and the other with <literal>(2,5,G_E)</literal> as its first child,
      respectively corresponding to sub-analyses
      <literal>(2+(3+4))</literal> and <literal>((2+3)+4)</literal>.
      Both alternatives have the token <literal>+</literal> as their
      second child, but note that they are different occurrences of
      <literal>+</literal> in the input!
      We strongly recommend looking at such results in graphical form
      to understand these points. If you build the
      <literal>expr-eval</literal> example in the directory
      <literal>examples/glr</literal> (NB you need to use GHC for this,
      unless you know how to use the <option>-F</option> flag for Hugs),
      running the example will produce a file which can be viewed with
      the <emphasis>daVinci</emphasis> graph visualization tool.
      (See <ulink url="http://www.informatik.uni-bremen.de/~davinci/"/>
       for more information. Educational use licenses are currently
	available without charge.)
      </para>

      <para>
      The GLR extension also allows semantic information to be attached
      to productions, as in conventional <application>Happy</application>,
      although there are further issues to consider.
      Two modes are provided, one for simple applications and one for more
      complex use.
      See <xref linkend="sec-glr-semantics"/>.
      The extension is also integrated with <application>Happy</application>'s
      token handling, e.g. extraction of information from tokens.
      </para>

      <para>
      One key feature of this implementation in Haskell is that its main
      result is a <emphasis>graph</emphasis>.
      Other implementations effectively produce a list of trees, but this
      limits practical use to small examples.
      For large and interesting applications, some of which are discussed
      in <xref linkend="sec-glr-misc-applications"/>, a graph is essential due
      to the large number of possibilities and the need to analyse the
      structure of the ambiguity. Converting the graph to trees could produce
      huge numbers of results and will lose information about sharing etc.
      </para>

      <para>
      One final comment. You may have learnt through using
      <application>yacc</application>-style tools that ambiguous grammars
      are to be avoided, and that ambiguity is something that appears
      only in Natural Language processing.
      This is definitely not true.
      Many interesting grammars are ambiguous, and with GLR tools they
      can be used effectively.
      We hope you enjoy exploring this fascinating area!
      </para>

    </sect1>

    <sect1 id="sec-glr-using">
      <title>Basic use of a Happy-generated GLR parser</title>

      <para>
      This section explains how to generate and to use a GLR parser to
      produce structural results.
      Please check the examples for further information.
      Discussion of semantic issues comes later; see
      <xref linkend="sec-glr-semantics"/>.
      </para>

      <sect2 id="sec-glr-using-intro">
        <title>Overview</title>
        <para>
	The process of generating a GLR parser is broadly the same as
	for standard <application>Happy</application>. You write a grammar
	specification, run <application>Happy</application> on this to
	generate some Haskell code, then compile and link this into your
	program.
        </para>
        <para>
	An alternative to using Happy directly is to use the
	<ulink url="http://www.cs.chalmers.se/~markus/BNFC/">
	<application>BNF Converter</application></ulink> tool by
	Markus Forsberg, Peter Gammie, Michael Pellauer and Aarne Ranta.
	This tool creates an abstract syntax, grammar, pretty-printer
	and other useful items from a single grammar formalism, thus
	it saves a lot of work and improves maintainability.
	The current output of BNFC can be used with GLR mode now
	with just a few small changes, but from January 2005 we expect
	to have a fully-compatible version of BNFC.
        </para>
        <para>
	Most of the features of <application>Happy</application> still
	work, but note the important points below.
        </para>
	<variablelist>
	   <varlistentry>
	     <term>module header</term>
	     <listitem>
	       <para>
	       The GLR parser is generated in TWO files, one for data and
	       one for the driver. This is because the driver code needs
	       to be optimized, but for large parsers with lots of data,
	       optimizing the data tables too causes compilation to be
	       too slow.
	       </para>
	       <para>
	       Given a file <literal>Foo.y</literal>, the file
	       <literal>FooData.hs</literal>, containing the data
	       module, is generated with basic type information, the
	       parser tables, and the header and tail code that was
	       included in the parser specification.  Note that
	       <application>Happy</application> can automatically
	       generate the necessary module declaration statements,
	       if you do not choose to provide one in the grammar
	       file. But, if you do choose to provide the module
	       declaration statement, then the name of the module will
	       be parsed and used as the name of the driver
	       module. The parsed name will also be used to form the
	       name of the data module, but with the string
	       <literal>Data</literal> appended to it. The driver
	       module, which is to be found in the file
	       <literal>Foo.hs</literal>, will not contain any other
	       user-supplied text besides the module name. Do not
	       bother to supply any export declarations in your module
	       declaration statement: they will be ignored and
	       dropped, in favor of the standard export declaration.
	       </para>

	     </listitem>
	   </varlistentry>
	   <varlistentry>
	     <term>export of lexer</term>
	     <listitem>
	       <para>
	       You can declare a lexer (and error token) with the
	       <literal>%lexer</literal> directive as normal, but the
	       generated parser does NOT call this lexer automatically.
	       The action of the directive is only to
	       <emphasis>export</emphasis> the lexer function to the top
	       level. This is because some applications need finer control
	       of the lexing process.
	       </para>
	     </listitem>
	   </varlistentry>

	   <varlistentry>
	     <term>precedence information</term>
	     <listitem>
	       <para>
	       This still works, but note the reasons.
	       The precedence and associativity declarations are used in
	       <application>Happy</application>'s LR table creation to
	       resolve certain conflicts. It does this by retaining the
	       actions implied by the declarations and removing the ones
	       which clash with these.
	       The GLR parser back-end then produces code from these
	       filtered tables, hence the rejected actions are never
	       considered by the GLR parser.
	       </para>
	       <para>
	       Hence, declaring precedence and associativity is still
	       a good thing, since it avoids a certain amount of ambiguity
	       that the user knows how to remove.
	       </para>
	     </listitem>
	   </varlistentry>
	   <varlistentry>
	     <term>monad directive</term>
	     <listitem>
	       <para>
	       There is some support for monadic parsers.
	       The "tree decoding" mode
	       (see <xref linkend="sec-glr-semantics-tree"/>) can use the
	       information given in the <literal>%monad</literal>
	       declaration to monadify the decoding process.
	       This is explained in more detail in
	       <xref linkend="sec-glr-semantics-tree-monad"/>.
	       </para>
	       <para>
	       <emphasis>Note</emphasis>: the generated parsers don't include
	       Ashley Yakeley's monad context information yet. It is currently
	       just ignored.
	       If this is a problem, email and I'll make the changes required.
	       </para>
	     </listitem>
	   </varlistentry>
	   <varlistentry>
	     <term>parser name directive</term>
	     <listitem>
	       <para>
	       This has no effect at present. It will probably remain this
	       way: if you want to control names, you could use qualified
	       import.
	       </para>
	     </listitem>
	   </varlistentry>
	   <varlistentry>
	     <term>type information on non-terminals</term>
	     <listitem>
	       <para>
	       The generation of semantic code relies on type information
	       given in the grammar specification. If you don't give an
	       explicit signature, the type <literal>()</literal> is
	       assumed. If you get type clashes mentioning
	       <literal>()</literal> you may need to add type annotations.
	       Similarly, if you don't supply code for the semantic rule
	       portion, then the value <literal>()</literal> is used.
	       </para>
	     </listitem>
	   </varlistentry>
	   <varlistentry>
	     <term><literal>error</literal> symbol in grammars, and recovery
		</term>
	     <listitem>
	       <para>
	       No attempt to implement this yet. Any use of
	       <literal>error</literal> in grammars is thus ignored, and
	       parse errors will eventually mean a parse will fail.
	       </para>
	     </listitem>
	   </varlistentry>
	   <varlistentry>
	     <term>the token type</term>
	     <listitem>
	       <para>
	       The type used for tokens <emphasis>must</emphasis> be in
	       the <literal>Ord</literal> type class (and hence in
	       <literal>Eq</literal>), plus it is recommended that they
	       are in the <literal>Show</literal> class too.
	       The ordering is required for the implementation of
	       ambiguity packing.
	       It may be possible to relax this requirement, but it
	       is probably simpler just to require instances of the type
	       classes. Please tell us if this is a problem.
	       </para>
	     </listitem>
	   </varlistentry>
	</variablelist>

      </sect2>

      <sect2 id="sec-glr-using-main">
        <title>The main function</title>
        <para>
	The driver file exports a function
	<literal>doParse :: [[UserDefTok]] -> GLRResult</literal>.
	If you are using several parsers, use qualified naming to
	distinguish them.
	<literal>UserDefTok</literal> is a synonym for the type declared with
	the <literal>%tokentype</literal> directive.
        </para>
      </sect2>

      <sect2 id="sec-glr-using-input">
        <title>The input</title>
        <para>
	The input to <literal>doParse</literal> is a list of
	<emphasis>list of</emphasis> token values.
	The outer level represents the sequence of input symbols, and
	the inner list represents ambiguity in the tokenisation of each
	input symbol.
	For example, the word "run" can be at least a noun or a verb,
	hence the inner list will contain at least two values.
	If your tokens are not ambiguous, you will need to convert each
	token to a singleton list before parsing.
        </para>
      </sect2>

      <sect2 id="sec-glr-using-output">
        <title>The Parse Result</title>
        <para>
	The parse result is expressed with the following types.
	A successful parse yields a forest (explained below) and a single
	root node for the forest.
	A parse may fail for one of two reasons: running out of input or
	a (global) parse error. A global parse error means that it was
	not possible to continue parsing <emphasis>any</emphasis> of the
	live alternatives; this is different from a local error, which simply
	means that the current alternative dies and we try some other
	alternative. In both error cases, the forest at failure point is
	returned, since it may contain useful information.
	Unconsumed tokens are returned when there is a global parse error.
        </para>
<programlisting>
type ForestId = (Int,Int,GSymbol)
data GSymbol  = &lt;... automatically generated ...&gt;
type Forest   = FiniteMap ForestId [Branch]
type RootNode = ForestId
type Tokens   = [[(Int, GSymbol)]]
data Branch   = Branch {b_sem :: GSem, b_nodes :: [ForestId]}
data GSem     = &lt;... automatically generated ...&gt;

data GLRResult
  = ParseOK     RootNode Forest    -- forest with root
  | ParseError  Tokens   Forest    -- partial forest with bad input
  | ParseEOF             Forest    -- partial forest (missing input)
</programlisting>
	<para>
	Conceptually, the parse forest is a directed, acyclic and-or
	graph. It is represented by a mapping of <literal>ForestId</literal>s
	to lists of possible analyses. The <literal>FiniteMap</literal>
	type is used to provide efficient and convenient access.
	The <literal>ForestId</literal> type identifies nodes in the
	graph, named by the range of input they span and the category of
	analysis they license. <literal>GSymbol</literal> is generated
	automatically as a union of the names of grammar rules (prefixed
	by <literal>G_</literal> to avoid name clashes) and of tokens and
	an EOF symbol. Tokens are wrapped in the constructor
	<literal>HappyTok :: UserDefTok -> GSymbol</literal>.
	</para>
	<para>
	The <literal>Branch</literal> type represents a match for some
	right-hand side of a production, containing semantic information
	(see below)
	and a list of sub-analyses. Each of these is a node in the graph.
	Note that tokens are represented as childless nodes that span
	one input position. Empty productions will appear as childless nodes
	that start and end at the same position.
	</para>
      </sect2>

      <sect2 id="sec-glr-using-compiling">
        <title>Compiling the parser</title>
        <para>
	<application>Happy</application> will generate two files, and these
	should be compiled as normal Haskell files.
	If speed is an issue, then you should use the <option>-O</option>
	flags etc with the driver code, and if feasible, with the parser
	tables too.
        </para>
        <para>
        You can also use the <option>--ghc</option> flag to trigger certain
	<application>GHC</application>-specific optimizations. At present,
	this just causes use of unboxed types in the tables and in some key
	code.
	Using this flag causes relevant <application>GHC</application>
	option pragmas to be inserted into the generated code, so you shouldn't
	have to use any strange flags (unless you want to...).
        </para>
      </sect2>
    </sect1>

    <sect1 id="sec-glr-semantics">
      <title>Including semantic results</title>

      <para>
      This section discusses the options for including semantic information
      in grammars.
      </para>

      <sect2 id="sec-glr-semantics-intro">
        <title>Forms of semantics</title>
        <para>
	Semantic information may be attached to productions in the
	conventional way, but when more than one analysis is possible,
	the use of the semantic information must change.
	Two schemes have been implemented, which we call
	<emphasis>tree decoding</emphasis>
	and <emphasis>label decoding</emphasis>.
	The former is for simple applications, where there is not much
	ambiguity and hence where the effective unpacking of the parse
	forest isn't a factor. This mode is quite similar to the
	standard mode in <application>Happy</application>.
	The latter is for serious applications, where sharing is important
	and where processing of the forest (eg filtering) is needed.
	Here, the emphasis is on providing rich labels in nodes of
	the parse forest, to support such processing.
        </para>
	<para>
	The default mode is labelling. If you want the tree decode mode,
	use the <option>--decode</option> flag.
	</para>
      </sect2>

      <sect2 id="sec-glr-semantics-tree">
        <title>Tree decoding</title>
        <para>
	Tree decoding corresponds to unpacking the parse forest to individual
	trees and collecting the list of semantic results computed from
	each of these. It is a mode intended for simple applications,
	where there is limited ambiguity.
	You may access semantic results from components of a reduction
	using the dollar variables.
	As a working example, the following is taken from the
	<literal>expr-tree</literal> grammar in the examples.
	Note that the type signature is required, else the types in use
	can't be determined by the parser generator.
        </para>
<programlisting>
E :: {Int} -- type signature needed
  : E '+' E  { $1 + $3 }
  | E '*' E  { $1 * $3 }
  | i        { $1 }
</programlisting>
	<para>
	This mode works by converting each of the semantic rules into
	functions (abstracted over the dollar variables mentioned),
	and labelling each <literal>Branch</literal> created from a
	reduction of that rule with the function value.
	This amounts to <emphasis>delaying</emphasis> the action of the
	rule, since we must wait until we know the results of all of
	the sub-analyses before computing any of the results. (Certain
	cases of packing can add new analyses at a later stage.)
	</para>
	<para>
	At the end of parsing, the functions are applied across relevant
	sub-analyses via a recursive descent. The main interface to this
	is via the class and entry function below. Typically,
	<literal>decode</literal> should be called on the root of the
	forest, also supplying a function which maps node names to their
	list of analyses (typically a partial application of lookup in
	the forest value).
	The result is a list of semantic values.
	Note that the context of the call to <literal>decode</literal>
	should (eventually) supply a concrete type to allow selection
	of appropriate instance. Ie, you have to indicate in some way
	what type the semantic result should have.
	<literal>Decode_Result a</literal> is a synonym generated by
	<application>Happy</application>: for non-monadic semantics,
	it is equivalent to <literal>a</literal>; when monads are
	in use, it becomes the declared monad type.
	See the full <literal>expr-eval</literal> example for more
	information.
	</para>
<programlisting>
class TreeDecode a where
        decode_b :: (ForestId -> [Branch]) -> Branch -> [Decode_Result a]
decode :: TreeDecode a => (ForestId -> [Branch]) -> ForestId -> [Decode_Result a]
</programlisting>

	<para>
	The GLR parser generator identifies the types involved in each
	semantic rule, hence the types of the functions, then creates
	a union containing distinct types. Values of this union are
	stored in the branches. (The union is actually a bit more complex:
	it must also distinguish patterns of dollar-variable usage, eg
	a function <literal>\x y -> x + y </literal> could be applied to
	the first and second constituents, or to the first and third.)
	The parser generator also creates instances of the
	<literal>TreeDecode</literal> class, which unpacks the semantic
	function and applies it across the decodings of the possible
	combinations of children. Effectively, it does a cartesian product
	operation across the lists of semantic results from each of the
	children. Eg <literal>[1,2] "+" [3,4]</literal> produces
	<literal>[4,5,5,6]</literal>.
	Information is extracted from token values using the patterns
	supplied by the user when declaring tokens and their Haskell
	representation, so the dollar-dollar convention works also.
	</para>
	<para>
	The decoding process could be made more efficient by using
	memoisation techniques, but this hasn't been implemented since
	we believe the other (label) decoding mode is more useful. (If someone
	sends in a patch, we may include it in a future release -- but this
	might be tricky, eg require higher-order polymorphism?
	Plus, are there other ways of using this form of semantic function?)
	</para>
      </sect2>

      <sect2 id="sec-glr-semantics-label">
        <title>Label decoding</title>
        <para>
	The labelling mode aims to label branches in the forest with
	information that supports subsequent processing, for example
	the filtering and prioritisation of analyses prior to extraction
	of favoured solutions. As above, code fragments are given in
	braces and can contain dollar-variables. But these variables
	are expanded to node names in the graph, with the intention of
	easing navigation.
	The following grammar is from the <literal>expr-tree</literal>
	example.
        </para>
<programlisting>
E :: {Tree ForestId Int}
  : E '+' E      { Plus  $1 $3 }
  | E '*' E      { Times $1 $3 }
  | i            { Const $1 }
</programlisting>

        <para>
	Here, the semantic values provide more meaningful labels than
	the plain structural information. In particular, only the
	interesting parts of the branch are represented, and the
	programmer can clearly select or label the useful constituents
	if required. There is no need to remember that it is the first
	and third child in the branch which we need to extract, because
	the label only contains those values (the `noise' has been dropped).
	Consider also the difference between concrete and abstract syntax.
	The labels are oriented towards abstract syntax.
	Tokens are handled slightly differently here: when they appear
	as children in a reduction, their informational content can
	be extracted directly, hence the <literal>Const</literal> value
	above will be built with the <literal>Int</literal> value from
	the token, not some <literal>ForestId</literal>.
        </para>

        <para>
	Note the useful technique of making the label types polymorphic
	in the position used for forest indices. This allows replacement
	at a later stage with more appropriate values, eg. inserting
	lists of actual subtrees from the final decoding.
        </para>
	<para>
	Use of these labels is supported by a type class
	<literal>LabelDecode</literal>, which unpacks values of the
	automatically-generated union type <literal>GSem</literal>
	to the original type(s). The parser generator will create
	appropriate instances of this class, based on the type information
	in the grammar file. (Note that omitting type information leads
	to a default of <literal>()</literal>.)
	Observe that use of the labels is often like traversing an abstract
	syntax, and the structure of the abstract syntax type usually
	constrains the types of constituents; so once the overall type
	is fixed (eg. with a type cast or signature) then there are no
	problems with resolution of class instances.
	</para>

<programlisting>
class LabelDecode a where
        unpack :: GSem -> a
</programlisting>

        <para>
	Internally, the semantic values are packed in a union type as
	before, but there is no direct abstraction step. Instead, the
	<literal>ForestId</literal> values (from the dollar-variables)
	are bound when the corresponding branch is created from the
	list of constituent nodes. At this stage, token information
	is also extracted, using the patterns supplied by the user
	when declaring the tokens.
        </para>
      </sect2>

      <sect2 id="sec-glr-semantics-tree-monad">
        <title>Monadic tree decoding</title>
        <para>
	You can use the <literal>%monad</literal> directive in the
	tree-decode mode.
	Essentially, the decoding process now creates a list of monadic
	values, using the monad type declared in the directive.
	The default handling of the semantic functions is to apply the
	relevant <literal>return</literal> function to the value being
	returned. You can over-ride this using the <literal>{% ... }</literal>
	convention. The declared <literal>(>>=)</literal> function is
	used to assemble the computations.
	</para>
	<para>
	Note that no attempt is made to share the results of monadic
	computations from sub-trees. (You could possibly do this by
	supplying a memoising lookup function for the decoding process.)
	Hence, the usual behaviour is that decoding produces whole
	monadic computations, each part of which is computed afresh
	(in depth-first order) when the whole is computed.
	Hence you should take care to initialise any relevant state
	before computing the results from multiple solutions.
	</para>
	<para>
	This facility is experimental, and we welcome comments or
	observations on the approach taken!
	An example is provided (<literal>examples/glr/expr-monad</literal>).
	It is the standard example of arithmetic expressions, except that
	the <literal>IO</literal> monad is used, and a user exception is
	thrown when the second argument to addition is an odd number.
	Running this example will show a zero (from the exception handler)
	instead of the expected number amongst the results from the other
	parses.
	</para>
      </sect2>
    </sect1>

    <sect1 id="sec-glr-misc">
      <title>Further information</title>

      <para>
      Other useful information...
      </para>

      <sect2 id="sec-glr-misc-examples">
        <title>The GLR examples</title>
        <para>
	The directory <literal>examples/glr</literal> contains several examples
	from the small to the large. Please consult these or use them as a
	base for your experiments.
        </para>
      </sect2>

      <sect2 id="sec-glr-misc-graphs">
        <title>Viewing forests as graphs</title>
        <para>
	If you run the examples with <application>GHC</application>, each
	run will produce a file <literal>out.daVinci</literal>. This is a
	graph in the format expected by the <emphasis>daVinci</emphasis>
	graph visualization tool.
	(See <ulink url="http://www.informatik.uni-bremen.de/~davinci/"/>
	for more information. Educational use licenses are currently
	available without charge.)
        </para>
	<para>
	We highly recommend looking at graphs of parse results - it really
	helps to understand the results.
	The graphs files are created with Sven Panne's library for
	communicating with <emphasis>daVinci</emphasis>, supplemented
	with some extensions due to Callaghan. Copies of this code are
	included in the examples directory, for convenience.
	If you are trying to view large and complex graphs, contact Paul
	Callaghan (there are tools and techniques to make the graphs more
	manageable).
	</para>
      </sect2>

      <sect2 id="sec-glr-misc-applications">
        <title>Some Applications of GLR parsing</title>
        <para>
	GLR parsing (and related techniques) aren't just for badly written
	grammars or for things like natural language (NL) where ambiguity is
	inescapable. There are applications where ambiguity can represent
	possible alternatives in pattern-matching tasks, and the flexibility
	of these parsing techniques and the resulting graphs support deep
	analyses. Below, we briefly discuss some examples, a mixture from
	our recent work and from the literature.
        </para>

	<variablelist>
	   <varlistentry>
	     <term>Gene sequence analysis</term>
	     <listitem>
	       <para>
	       Combinations of structures within gene sequences can be
	       expressed as a grammar, for example a "start" combination
	       followed by a "promoter" combination then the gene proper.
	       A recent undergraduate project has used this GLR implementation
	       to detect candidate matches in data, and then to filter these
	       matches with a mixture of local and global information.
	       </para>
	     </listitem>
	   </varlistentry>
	   <varlistentry>
	     <term>Rhythmic structure in poetry</term>
	     <listitem>
	       <para>
	       Rhythmic patterns in (English) poetry obey certain rules,
	       and in more modern poetry can break rules in particular ways
	       to achieve certain effects. The standard rhythmic patterns
	       (eg. iambic pentameter) can be encoded as a grammar, and
	       deviations from the patterns also encoded as rules.
	       The neutral reading can be parsed with this grammar, to
	       give a forest of alternative matches. The forest can be
	       analysed to give a preferred reading, and to highlight
	       certain technical features of the poetry.
	       An undergraduate project in Durham has used this implementation
	       for this purpose, with promising results.
	       </para>
	     </listitem>
	   </varlistentry>
	   <varlistentry>
	     <term>Compilers -- instruction selection</term>
	     <listitem>
	       <para>
	       Recent work has phrased the translation problem in
	       compilers from intermediate representation to an
	       instruction set for a given processor as a matching
	       problem. Different constructs at the intermediate
	       level can map to several combinations of machine
	       instructions. This knowledge can be expressed as a
	       grammar, and instances of the problem solved by
	       parsing. The parse forest represents competing solutions,
	       and allows selection of optimum solutions according
	       to various measures.
	       </para>
	     </listitem>
	   </varlistentry>
	   <varlistentry>
	     <term>Robust parsing of ill-formed input</term>
	     <listitem>
	       <para>
	       The extra flexibility of GLR parsing can simplify parsing
	       of formal languages where a degree of `informality' is allowed.
	       For example, Html parsing. Modern browsers contain complex
	       parsers which are designed to try to extract useful information
	       from Html text which doesn't follow the rules precisely,
	       eg missing start tags or missing end tags.
	       Html with missing tags can be written as an ambiguous grammar,
	       and it should be a simple matter to extract a usable
	       interpretation from a forest of parses.
	       Notice the technique: we widen the scope of the grammar,
	       parse with GLR, then extract a reasonable solution.
	       This is arguably simpler than pushing an LR(1) or LL(1)
	       parser past its limits, and also more maintainable.
	       </para>
	     </listitem>
	   </varlistentry>
	   <varlistentry>
	     <term>Natural Language Processing</term>
	     <listitem>
	       <para>
	       Ambiguity is inescapable in the syntax of most human languages.
	       In realistic systems, parse forests are useful to encode
	       competing analyses in an efficient way, and they also provide
	       a framework for further analysis and disambiguation. Note
	       that ambiguity can have many forms, from simple phrase
	       attachment uncertainty to more subtle forms involving mixtures
	       of word senses. If some degree of ungrammaticality is to be
	       tolerated in a system, which can be done by extending the
	       grammar with productions incorporating common forms of
	       infelicity, the degree of ambiguity increases further. For
	       systems used on arbitrary text, such as on newspapers,
	       it is not uncommon that many sentences permit several
	       hundred or more analyses. With such grammars, parse forest
	       techniques are essential.
	       Many recent NLP systems use such techniques, including
	       Durham's earlier LOLITA system - which was mostly
	       written in Haskell.
	       </para>
	     </listitem>
	   </varlistentry>
	</variablelist>
      </sect2>

      <sect2 id="sec-glr-misc-workings">
        <title>Technical details</title>
        <para>
	The original implementation was developed by Ben Medlock,
	as his undergraduate final year project,
	using ideas from Peter Ljungloef's Licentiate thesis
	(see <ulink url="http://www.cs.chalmers.se/~peb/parsing"/>, and
	we recommend the thesis for its clear analysis of parsing
	algorithms).
	Ljungloef's version produces lists of parse trees, but Medlock
	adapted this to produce an explicit graph containing parse structure
	information. He also incorporated
	the code into <application>Happy</application>.
        </para>

        <para>
	After Medlock's graduation, Callaghan extended the code to
	incorporate semantic information, and made several improvements
	to the original code, such as improved local packing and
	support for hidden left recursion. The performance of the
	code was significantly improved, after changes of representation
	(eg to a chart-style data structure)
	and technique. Medlock's code was also used in several student
	projects, including analysis of gene sequences (Fischer) and
	analysis of rhythmic patterns in poetry (Henderson).
        </para>

        <para>
	The current code implements the standard GLR algorithm extended
	to handle hidden left recursion. Such recursion, as in the grammar
	below from Rekers [1992], causes the standard algorithm to loop
	because the empty reduction <literal>A -> </literal> is always
	possible and the LR parser will not change state. Alternatively,
	there is a problem because an unknown (at the start of parsing)
	number of <literal>A</literal>
	items are required, to match the number of <literal>i</literal>
	tokens in the input.
        </para>
<programlisting>
S -> A Q i | +
A ->
</programlisting>
	<para>
	The solution to this is not surprising. Problematic recursions
	are detected as zero-span reductions in a state which has a
	<literal>goto</literal> table entry looping to itself. A special
	symbol is pushed to the stack on the first such reduction,
	and such reductions are done at most once for any token
	alternative for any input position.
	When popping from the stack, if the last token being popped
	is such a special symbol, then two stack tails are returned: one
	corresponding to a conventional pop (which removes the
	symbol) and the other to a duplication of the special symbol
	(the stack is not changed, but a copy of the symbol is returned).
	This allows sufficient copies of the empty symbol to appear
	on some stack, hence allowing the parse to complete.
	</para>

	<para>
	The forest is held in a chart-style data structure, and this supports
	local ambiguity packing (chart parsing is discussed in Ljungloef's
	thesis, among other places).
	A limited amount of packing of live stacks is also done, to avoid
	some repetition of work.
	</para>

        <para>
	[Rekers 1992] Parser Generation for Interactive Environments,
	PhD thesis, University of Amsterdam, 1992.
        </para>
      </sect2>

      <sect2 id="sec-glr-misc-filter">
        <title>The <option>--filter</option> option</title>
        <para>
	You might have noticed this GLR-related option. It is an experimental
	feature intended to restrict the amount of structure retained in the
	forest by discarding everything not required for the semantic
	results. It may or it may not work, and may be fixed in a future
	release.
        </para>
      </sect2>

      <sect2 id="sec-glr-misc-limitations">
        <title>Limitations and future work</title>
	<para>
	The parser supports hidden left recursion, but makes no attempt
	to handle cyclic grammars that have rules which do not consume any
	input. If you have a grammar like this, for example with rules like
	<literal>S -> S</literal> or
	<literal>S -> A S | x; A -> empty</literal>, the implementation will
	loop until you run out of stack - but if it is going to happen, it
	often happens quite quickly!
	</para>
        <para>
	The code has been used and tested frequently over the past few years,
	including being used in several undergraduate projects. It should be
	fairly stable, but as usual, can't be guaranteed bug-free. One day
	I will write it in Epigram!
	</para>
        <para>
	If you have suggestions for improvements, or requests for features,
	please contact Paul
	Callaghan. There are some changes I am considering, and some
	views and/or encouragement from users will be much appreciated.
	Further information can be found on Callaghan's
	<ulink url="http://www.dur.ac.uk/p.c.callaghan/happy-glr">GLR parser
	page</ulink>.
        </para>
      </sect2>

      <sect2 id="sec-glr-misc-acknowledgements">
        <title>Thanks and acknowledgements</title>
        <para>
	Many thanks to the people who have used and tested this software
	in its various forms, including Julia Fischer, James Henderson, and
	Aarne Ranta.
        </para>
      </sect2>
    </sect1>
  </chapter>

<!-- Attribute Grammars ================================================= -->
  <chapter id="sec-AttributeGrammar">
    <title>Attribute Grammars</title>

    <sect1 id="sec-introAttributeGrammars">
    <title>Introduction</title>

    <para>Attribute grammars are a formalism for expressing syntax directed
    translation of a context-free grammar.  An introduction to attribute grammars
    may be found <ulink
    url="http://www-rocq.inria.fr/oscar/www/fnc2/manual/node32.html">here</ulink>.
    There is also an article in the Monad Reader about attribute grammars and a
    different approach to attribute grammars using Haskell
    <ulink url="http://www.haskell.org/haskellwiki/The_Monad.Reader/Issue4/Why_Attribute_Grammars_Matter">here</ulink>.
    </para>

    <para>
    The main practical difficulty that has prevented attribute grammars from
    gaining widespread use involves evaluating the attributes.  Attribute grammars
    generate non-trivial data dependency graphs that are difficult to evaluate
    using mainstream languages and techniques.  The solutions generally involve
    restricting the form of the grammars or using big hammers like topological sorts.
    However, a language which supports lazy evaluation, such as Haskell, has no
    problem forming complex data dependency graphs and evaluating them.  The primary
    intellectual barrier to attribute grammar adoption seems to stem from the fact that
    most programmers have difficulty with the declarative nature of the
    specification.  Haskell programmers, on the other hand, have already
    embraced a purely functional language.  In short, the Haskell language and
    community seem like a perfect place to experiment with attribute grammars.
    </para>

    <para>
    Embedding attribute grammars in Happy is easy because Haskell supports
    three important features: higher order functions, labeled records, and
    lazy evaluation.  Attributes are encoded as fields in a labeled record. The parse
    result of each non-terminal in the grammar is a function which takes a record
    of inherited attributes and returns a record of synthesized attributes.  In each
    production, the attributes of various non-terminals are bound together using
    <literal>let</literal>.
    Finally, at the end of the parse, a distinguished attribute is evaluated to be
    the final result.  Lazy evaluation takes care of evaluating each attribute in the
    correct order, resulting in an attribute grammar system that is capable of evaluating
    a fairly large class of attribute grammars.
    </para>

    <para>
    Attribute grammars in Happy do not use any language extensions, so the
    parsers are Haskell 98 (assuming you don't use the GHC specific -g option).
    Currently, attribute grammars cannot be generated for GLR parsers. (It's not
    exactly clear how these features should interact.)
    </para>

    </sect1>

    <sect1 id="sec-AtrributeGrammarsInHappy">
    <title>Attribute Grammars in Happy</title>

    <sect2 id="sec-declaringAttributes">
      <title>Declaring Attributes</title>

      <para>
      The presence of one or more <literal>%attribute</literal> directives indicates
      that a grammar is an attribute grammar.  Attributes are calculated properties
      that are associated with the non-terminals in a parse tree.  Each
      <literal>%attribute</literal> directive generates a field in the attributes
      record with the given name and type.
      </para>

      <para>
      The first <literal>%attribute</literal>
      directive in a grammar defines the default attribute.  The
      default attribute is distinguished in two ways: 1) if no attribute specifier is
      given on an attribute reference,
      the default attribute is assumed (see <xref linkend="sec-semanticRules"/>)
      and 2) the value for the default attribute of the starting non-terminal becomes the
      return value of the parse.
      </para>

      <para>
      Optionally, one may specify a type declaration for the attribute record using
      the <literal>%attributetype</literal> declaration.  This allows you to define the
      type given to the attribute record and, more importantly, allows you to introduce
      type variables that can be subsequently used in <literal>%attribute</literal>
      declarations.  If the <literal>%attributetype</literal> directive is given without
      any <literal>%attribute</literal> declarations, then the <literal>%attributetype</literal>
      declaration has no effect.
      </para>

      <para>
      For example, the following declarations:
      </para>

<programlisting>
%attributetype { MyAttributes a }
%attribute value { a }
%attribute num   { Int }
%attribute label { String }
</programlisting>

      <para>
      would generate this attribute record declaration in the parser:
      </para>

<programlisting>
data MyAttributes a =
   HappyAttributes {
     value :: a,
     num :: Int,
     label :: String
   }
</programlisting>

       <para>
       and <literal>value</literal> would be the default attribute.
       </para>

    </sect2>

    <sect2 id="sec-semanticRules">
      <title>Semantic Rules</title>

      <para>In an ordinary Happy grammar, a production consists of a list
      of terminals and/or non-terminals followed by an uninterpreted
      code fragment enclosed in braces.  With an attribute grammar, the
      format is very similar, but the braces enclose a set of semantic rules
      rather than uninterpreted Haskell code.  Each semantic rule is either
      an attribute calculation or a conditional, and rules are separated by
      semicolons<footnote><para>Note that semantic rules must not rely on
      layout, because whitespace alignment is not guaranteed to be
      preserved</para></footnote>.
      </para>

      <para>
      Both attribute calculations and conditionals may contain attribute references
      and/or terminal references.  Just like regular Happy grammars, the tokens
      <literal>$1</literal> through <literal>$&lt;n&gt;</literal>, where
      <literal>n</literal> is the number of symbols in the production, refer to
      subtrees of the parse.  If the referenced symbol is a terminal, then the
      value of the reference is just the value of the terminal, the same way as
      in a regular Happy grammar.  If the referenced symbol is a non-terminal,
      then the reference may be followed by an attribute specifier, which is
      a dot followed by an attribute name.  If the attribute specifier is omitted,
      then the default attribute is assumed (the default attribute is the first
      attribute appearing in an <literal>%attribute</literal> declaration).
      The special reference <literal>$$</literal> references the
      attributes of the current node in the parse tree; it behaves exactly
      like the numbered references.  Additionally, the reference <literal>$></literal>
      always references the rightmost symbol in the production.
      </para>

      <para>
      An attribute calculation rule is of the form:
      </para>
<programlisting>
&lt;attribute reference&gt; = &lt;Haskell expression&gt;
</programlisting>
      <para>
      A rule of this form defines the value of an attribute, possibly as a function
      of the attributes of <literal>$$</literal> (inherited attributes), the attributes
      of non-terminals in the production (synthesized attributes), or the values of
      terminals in the production.  The value for an attribute can only
      be defined once for a particular production.
      </para>

      <para>
      The following rule calculates the default attribute of the current production in
      terms of the first and second items of the production (a synthesized attribute):
      </para>
<programlisting>
$$ = $1 : $2
</programlisting>

      <para>
      This rule calculates the length attribute of a non-terminal in terms of the
      length of the current non-terminal (an inherited attribute):
      </para>
<programlisting>
$1.length = $$.length + 1
</programlisting>

      <para>
      Conditional rules allow the rejection of strings due to context-sensitive properties.
      All conditional rules have the form:
      </para>
<programlisting>
where &lt;Haskell expression&gt;
</programlisting>
      <para>
      For non-monadic parsers, all conditional expressions
      must be of the same (monomorphic) type.  At
      the end of the parse, the conditionals will be reduced using
      <literal>seq</literal>, which gives the grammar an opportunity to call
      <literal>error</literal> with an informative message.  For monadic parsers,
      all conditional statements must have type <literal>Monad m => m ()</literal> where
      <literal>m</literal> is the monad in which the parser operates.  All conditionals
      will be sequenced at the end of the parse, which allows the conditionals to call
      <literal>fail</literal> with an informative message.
      </para>

      <para>
      The following conditional rule will cause the (non-monadic) parser to fail
      if the inherited length attribute is not 0.
      </para>
<programlisting>
where if $$.length == 0 then () else error "length not equal to 0"
</programlisting>

      <para>
      This conditional is the monadic equivalent:
      </para>
<programlisting>
where unless ($$.length == 0) (fail "length not equal to 0")
</programlisting>


    </sect2>
    </sect1>

    <sect1 id="sec-AttrGrammarLimits">
      <title>Limits of Happy Attribute Grammars</title>

      <para>
	If you are not careful, you can write an attribute grammar which fails to
	terminate.  This generally happens when semantic rules
	are written which cause a circular dependency on the value of
	an attribute.  Even if the value of the attribute is well-defined (that is,
	if a fixpoint calculation over attribute values will eventually converge to
	a unique solution), this attribute grammar system will not evaluate such
	grammars.
      </para>
      <para>
	One practical way to overcome this limitation is to ensure that each attribute
	is always used in either a top-down (inherited) fashion or in a bottom-up
	(synthesized) fashion.  If the calculations are sufficiently lazy, one can
	"tie the knot" by synthesizing a value in one attribute, and then assigning
	that value to another, inherited attribute at some point in the parse tree.
	This technique can be useful for common tasks like building symbol tables for
	a syntactic scope and making that table available to sub-nodes of the parse.
      </para>
    </sect1>


    <sect1 id="sec-AttributeGrammarExample">
      <title>Example Attribute Grammars</title>
      <para>
      The following two toy attribute grammars may prove instructive.  The first is
      an attribute grammar for the classic context-sensitive grammar
      { a^n b^n c^n | n >= 0 }.  It demonstrates the use of conditionals,
      inherited and synthesized attributes.
      </para>

<programlisting>
{
module ABCParser (parse) where
}

%tokentype { Char }

%token a { 'a' }
%token b { 'b' }
%token c { 'c' }
%token newline { '\n' }

%attributetype { Attrs a }
%attribute value { a }
%attribute len   { Int }

%name parse abcstring

%%

abcstring
   : alist blist clist newline
        { $$ = $1 ++ $2 ++ $3
        ; $2.len = $1.len
        ; $3.len = $1.len
        }

alist
   : a alist
        { $$ = $1 : $2
        ; $$.len = $2.len + 1
        }
   |    { $$ = []; $$.len = 0 }

blist
   : b blist
        { $$ = $1 : $2
        ; $2.len = $$.len - 1
        }
   |    { $$ = []
        ; where failUnless ($$.len == 0) "blist wrong length"
        }

clist
   : c clist
        { $$ = $1 : $2
        ; $2.len = $$.len - 1
        }
   |    { $$ = []
        ; where failUnless ($$.len == 0) "clist wrong length"
        }

{
happyError = error "parse error"
failUnless b msg = if b then () else error msg
}
</programlisting>

<para>
This grammar parses binary numbers and
calculates their value.  It demonstrates
the use of inherited and synthesized attributes.
</para>


<programlisting>
{
module BitsParser (parse) where
}

%tokentype { Char }

%token minus { '-' }
%token plus  { '+' }
%token one   { '1' }
%token zero  { '0' }
%token newline { '\n' }

%attributetype { Attrs }
%attribute value { Integer }
%attribute pos   { Int }

%name parse start

%%

start
   : num newline { $$ = $1 }

num
   : bits        { $$ = $1       ; $1.pos = 0 }
   | plus bits   { $$ = $2       ; $2.pos = 0 }
   | minus bits  { $$ = negate $2; $2.pos = 0 }

bits
   : bit         { $$ = $1
                 ; $1.pos = $$.pos
                 }

   | bits bit    { $$ = $1 + $2
                 ; $1.pos = $$.pos + 1
                 ; $2.pos = $$.pos
                 }

bit
   : zero        { $$ = 0 }
   | one         { $$ = 2^($$.pos) }

{
happyError = error "parse error"
}
</programlisting>


    </sect1>

  </chapter>

<!-- Invoking ============================================================ -->

  <chapter id="sec-invoking">
    <title>Invoking <application>Happy</application></title>

    <para>An invocation of <application>Happy</application> has the following syntax:</para>

<screen>$ happy [ <emphasis>options</emphasis> ] <emphasis>filename</emphasis> [ <emphasis>options</emphasis> ]</screen>

    <para>All the command line options are optional (!) and may occur
    either before or after the input file name. Options that take
    arguments may be given multiple times, and the last occurrence
    will be the value used.</para>

    <para>There are two types of grammar files,
    <filename>file.y</filename> and <filename>file.ly</filename>, with
    the latter observing the reverse comment (or literate) convention
    (i.e. each code line must begin with the character
    <literal>&gt;</literal>, lines which don't begin with
    <literal>&gt;</literal> are treated as comments).  The examples
    distributed with <application>Happy</application> are all of the
    .ly form.</para>
    <indexterm>
      <primary>literate grammar files</primary>
    </indexterm>

    <para>The flags accepted by <application>Happy</application> are as follows:</para>

    <variablelist>

      <varlistentry>
	<term><option>-o</option> <replaceable>file</replaceable></term>
	<term><option>--outfile</option>=<replaceable>file</replaceable></term>
	<listitem>
	  <para>Specifies the destination of the generated parser module.
	  If omitted, the parser will be placed in
          <replaceable>file</replaceable><literal>.hs</literal>,
	  where <replaceable>file</replaceable> is the name of the input
          file with any extension removed.</para>
	</listitem>
      </varlistentry>

      <varlistentry>
	<term><option>-i</option><optional><replaceable>file</replaceable></optional></term>
	<term><option>--info</option><optional>=<replaceable>file</replaceable></optional></term>
	<listitem>
	  <indexterm>
	    <primary>info file</primary>
	  </indexterm>
	  <para> Directs <application>Happy</application> to produce an info file
          containing detailed information about the grammar, parser
          states, parser actions, and conflicts.  Info files are vital
          during the debugging of grammars.  The filename argument is
          optional (note that there's no space between
          <literal>-i</literal> and the filename in the short
          version), and if omitted the info file will be written to
          <replaceable>file</replaceable><literal>.info</literal> (where
          <replaceable>file</replaceable> is the input file name with any
          extension removed).</para>
	</listitem>
      </varlistentry>

      <varlistentry>
	<term><option>-p</option><optional><replaceable>file</replaceable></optional></term>
	<term><option>--pretty</option><optional>=<replaceable>file</replaceable></optional></term>
	<listitem>
	  <indexterm>
	    <primary>pretty print</primary>
	  </indexterm>
	  <para> Directs <application>Happy</application> to produce a file
          containing a pretty-printed form of the grammar, containing only
          the productions, without any semantic actions or type signatures.
          If no file name is provided, then the file name will be computed
          by replacing the extension of the input file with
          <literal>.grammar</literal>.
          </para>
	</listitem>
      </varlistentry>



      <varlistentry>
	<term><option>-t</option> <replaceable>dir</replaceable></term>
	<term><option>--template</option>=<replaceable>dir</replaceable></term>
	<listitem>
	  <indexterm>
	    <primary>template files</primary>
	  </indexterm>
	  <para>Instructs <application>Happy</application> to use this directory
          when looking for template files: these files contain the
          static code that <application>Happy</application> includes in every
          generated parser.  You shouldn't need to use this option if
          <application>Happy</application> is properly configured for your
          computer.</para>
	</listitem>
      </varlistentry>

      <varlistentry>
	<term><option>-m</option> <replaceable>name</replaceable></term>
	<term><option>--magic-name</option>=<replaceable>name</replaceable></term>
	<listitem>
	  <para> <application>Happy</application> prefixes all the symbols it uses internally
          with either <literal>happy</literal> or <literal>Happy</literal>.  To use a
          different string, for example if the use of <literal>happy</literal>
          is conflicting with one of your own functions, specify the
          prefix using the <option>-m</option> option.</para>
	</listitem>
      </varlistentry>

      <varlistentry>
	<term><option>-s</option></term>
	<term><option>--strict</option></term>
	<listitem>
	  <para>NOTE: the <option>--strict</option> option is
	  experimental and may cause unpredictable results.</para>

	  <para>This option causes the right hand side of each
	  production (the semantic value) to be evaluated eagerly at
	  the moment the production is reduced.  If the lazy behaviour
	  is not required, then using this option will improve
	  performance and may reduce space leaks.  Note that the
	  parser as a whole is never lazy - the whole input will
	  always be consumed before any input is produced, regardless
	  of the setting of the <option>--strict</option> flag.</para>
	</listitem>
      </varlistentry>

      <varlistentry>
	<term><option>-g</option></term>
	<term><option>--ghc</option></term>
	<listitem>
	  <indexterm>
	    <primary>GHC</primary>
	  </indexterm>
	  <indexterm>
	    <primary>back-ends</primary>
	    <secondary>GHC</secondary>
	  </indexterm>
	  <para>Instructs <application>Happy</application> to generate a parser
	  that uses GHC-specific extensions to obtain faster code.</para>
	</listitem>
      </varlistentry>

      <varlistentry>
	<term><option>-c</option></term>
	<term><option>--coerce</option></term>
	<listitem>
	  <indexterm>
	    <primary>coerce</primary>
	  </indexterm>
	  <indexterm>
	    <primary>back-ends</primary>
	    <secondary>coerce</secondary>
	  </indexterm>
	  <para> Use GHC's <literal>unsafeCoerce#</literal> extension to
          generate smaller faster parsers.  Type-safety isn't
          compromised.</para>

	  <para>This option may only be used in conjunction with
          <option>-g</option>.</para>
	</listitem>
      </varlistentry>

      <varlistentry>
	<term><option>-a</option></term>
	<term><option>--arrays</option></term>
	<listitem>
	  <indexterm>
	    <primary>arrays</primary>
	  </indexterm>
	  <indexterm>
	    <primary>back-ends</primary>
	    <secondary>arrays</secondary>
	  </indexterm>
	  <para> Instructs <application>Happy</application> to generate a parser
          using an array-based shift reduce parser.  When used in
          conjunction with <option>-g</option>, the arrays will be
          encoded as strings, resulting in faster parsers.  Without
          <option>-g</option>, standard Haskell arrays will be
          used.</para>
	</listitem>
      </varlistentry>

      <varlistentry>
	<term><option>-d</option></term>
	<term><option>--debug</option></term>
	<listitem>
	  <indexterm>
	    <primary>debug</primary>
	  </indexterm>
	  <indexterm>
	    <primary>back-ends</primary>
	    <secondary>debug</secondary>
	  </indexterm>
	  <para>Generate a parser that will print debugging
	  information to <literal>stderr</literal> at run-time,
	  including all the shifts, reductions, state transitions and
	  token inputs performed by the parser.</para>

	  <para>This option can only be used in conjunction with
	  <option>-a</option>.</para>
	</listitem>
      </varlistentry>

      <varlistentry>
        <term><option>-l</option></term>
        <term><option>--glr</option></term>
        <listitem>
          <indexterm>
            <primary>glr</primary>
          </indexterm>
          <indexterm>
            <primary>back-ends</primary>
            <secondary>glr</secondary>
          </indexterm>
          <para>Generate a GLR parser for ambiguous grammars.</para>
        </listitem>
      </varlistentry>

      <varlistentry>
        <term><option>-k</option></term>
        <term><option>--decode</option></term>
        <listitem>
          <indexterm>
            <primary>decode</primary>
          </indexterm>
          <para>Generate simple decoding code for GLR result.</para>
        </listitem>
      </varlistentry>

      <varlistentry>
        <term><option>-f</option></term>
        <term><option>--filter</option></term>
        <listitem>
          <indexterm>
            <primary>filter</primary>
          </indexterm>
          <para>Filter the GLR parse forest with respect to semantic usage.</para>
        </listitem>
      </varlistentry>

      <varlistentry>
	<term><option>-?</option></term>
	<term><option>--help</option></term>
	<listitem>
	  <para>Print usage information on standard output then exit
	  successfully.</para>
	</listitem>
      </varlistentry>

      <varlistentry>
	<term><option>-V</option></term>
	<term><option>--version</option></term>
	<listitem>
	  <para>Print version information on standard output then exit
	  successfully. Note that for legacy reasons <option>-v</option>
	  is supported, too, but the use of it is deprecated.
	  <option>-v</option> will be used for verbose mode when it is
	  actually implemented.</para>
	</listitem>
      </varlistentry>

    </variablelist>

  </chapter>

  <chapter id="sec-grammar-files">
    <title>Syntax of Grammar Files</title>

    <para>The input to <application>Happy</application> is a text file containing
    the grammar of the language you want to parse, together with some
    annotations that help the parser generator make a legal Haskell
    module that can be included in your program.  This section gives
    the exact syntax of grammar files. </para>

    <para>The overall format of the grammar file is given below:</para>

<programlisting>
&lt;optional module header&gt;
&lt;directives&gt;
%%
&lt;grammar&gt;
&lt;optional module trailer&gt;
</programlisting>

    <indexterm>
      <primary>module</primary>
      <secondary>header</secondary>
    </indexterm>
    <indexterm>
      <primary>module</primary>
      <secondary>trailer</secondary>
    </indexterm>
    <para>If the name of the grammar file ends in <literal>.ly</literal>, then
    it is assumed to be a literate script.  All lines except those
    beginning with a <literal>&gt;</literal> will be ignored, and the
    <literal>&gt;</literal> will be stripped from the beginning of all the code
    lines.  There must be a blank line between each code section
    (lines beginning with <literal>&gt;</literal>) and comment section.
    Grammars not using the literate notation must be in a file with
    the <literal>.y</literal> suffix.</para>

    <sect1 id="sec-lexical-rules">
      <title>Lexical Rules</title>

<para>Identifiers in <application>Happy</application> grammar files must take the following form (using
the BNF syntax from the Haskell Report):</para>

<programlisting>
id      ::= alpha { idchar }
          | ' { any{^'} | \' } '
          | " { any{^"} | \" } "

alpha   ::= A | B | ... | Z
          | a | b | ... | z

idchar  ::= alpha
          | 0 | 1 | ... | 9
          | _
</programlisting>

    </sect1>

    <sect1 id="sec-module-header">
      <title>Module Header</title>

      <indexterm>
	<primary>module</primary>
	<secondary>header</secondary>
      </indexterm>
      <para>This section is optional, but if included takes the
      following form:</para>

<programlisting>
{
&lt;Haskell module header&gt;
}
</programlisting>

      <para>The Haskell module header contains the module name,
      exports, and imports.  No other code is allowed in the
      header&mdash;this is because <application>Happy</application> may need to include
      its own <literal>import</literal> statements directly after the user
      defined header.</para>

    </sect1>

    <sect1 id="sec-directives">
      <title>Directives</title>

      <para>This section contains a number of lines of the form:</para>

<programlisting>
%&lt;directive name&gt; &lt;argument&gt; ...
</programlisting>

      <para>The statements here are all annotations to help
      <application>Happy</application> generate the Haskell code for the grammar.
      Some of them are optional, and some of them are required.</para>

      <sect2 id="sec-token-type">
	<title>Token Type</title>

<programlisting>
%tokentype   { &lt;valid Haskell type&gt; }
</programlisting>

	<indexterm>
	  <primary><literal>%tokentype</literal></primary>
	</indexterm>
	<para>(mandatory) The <literal>%tokentype</literal> directive gives the
        type of the tokens passed from the lexical analyser to the
        parser (in order that <application>Happy</application> can supply types for
        functions and data in the generated parser).</para>

      </sect2>

      <sect2 id="sec-tokens">
	<title>Tokens</title>

<programlisting>
%token &lt;name&gt; { &lt;Haskell pattern&gt; }
       &lt;name&gt; { &lt;Haskell pattern&gt; }
       ...
</programlisting>

	<indexterm>
	  <primary><literal>%token</literal></primary>
	</indexterm>
	<para>(mandatory) The <literal>%token</literal> directive is used to
        tell <application>Happy</application> about all the terminal symbols used
        in the grammar.  Each terminal has a name, by which it is
        referred to in the grammar itself, and a Haskell
        representation enclosed in braces.  Each of the patterns must
        be of the same type, given by the <literal>%tokentype</literal>
        directive.</para>

	<para>The name of each terminal follows the lexical rules for
        <application>Happy</application> identifiers given above.  There are no
        lexical differences between terminals and non-terminals in the
        grammar, so it is recommended that you stick to a convention;
        for example using upper case letters for terminals and lower
        case for non-terminals, or vice-versa.</para>

	<para><application>Happy</application> will give you a warning if you try
        to use the same identifier both as a non-terminal and a
        terminal, or introduce an identifier which is declared as
        neither.</para>

	<para>To save writing lots of projection functions that map
        tokens to their components, you can include
        <literal>&dollar;&dollar;</literal> in your Haskell pattern. For
        example:</para>
	<indexterm>
	  <primary><literal>&dollar;&dollar;</literal></primary>
	</indexterm>

<programlisting>
%token INT { TokenInt $$ }
       ...
</programlisting>

<para>This makes the semantic value of <literal>INT</literal> refer to the first argument
of <literal>TokenInt</literal> rather than the whole token, eliminating the need for
any projection function.</para>

      </sect2>

      <sect2 id="sec-parser-name">
	<title>Parser Name</title>

<programlisting>
%name &lt;Haskell identifier&gt; [ &lt;non-terminal&gt; ]
...
</programlisting>
	<indexterm>
	  <primary><literal>%name</literal></primary>
	</indexterm>

	<para>(optional) The <literal>%name</literal> directive is followed by
        a valid Haskell identifier, and gives the name of the
        top-level parsing function in the generated parser.  This is
        the only function that needs to be exported from a parser
        module.</para>

	<para>If the <literal>%name</literal> directive is omitted, it
        defaults to <literal>happyParse</literal>.</para>
	<indexterm>
	  <primary><function>happyParse</function></primary>
	</indexterm>

	<para>The <literal>%name</literal> directive takes an optional
	second parameter which specifies the top-level non-terminal
	which is to be parsed.  If this parameter is omitted, it
	defaults to the first non-terminal defined in the
	grammar.</para>

	<para>Multiple <literal>%name</literal> directives may be
	given, specifying multiple parser entry points for this
	grammar (see <xref linkend="sec-multiple-parsers"/>).  When
	multiple <literal>%name</literal> directives are given, they
	must all specify explicit non-terminals.</para>
      </sect2>

      <sect2 id="sec-partial-parsers">
	<title>Partial Parsers</title>

<programlisting>
%partial &lt;Haskell identifier&gt; [ &lt;non-terminal&gt; ]
...
</programlisting>
	<indexterm>
	  <primary><literal>%partial</literal></primary>
	</indexterm>

	<para>The <literal>%partial</literal> directive can be used instead of
	  <literal>%name</literal>.  It indicates that the generated parser
	  should be able to parse an initial portion of the input.  In
	  contrast, a parser specified with <literal>%name</literal> will only
	  parse the entire input.</para>

	<para>A parser specified with <literal>%partial</literal> will stop
	  parsing and return a result as soon as there exists a complete parse,
	  and no more of the input can be parsed.  It does this by accepting
	  the parse if it is followed by the <literal>error</literal> token,
	  rather than insisting that the parse is followed by the
	  end of the token stream (or the <literal>eof</literal> token in the
	  case of a <literal>%lexer</literal> parser).</para>
      </sect2>

      <sect2 id="sec-monad-decl">
	<title>Monad Directive</title>

<programlisting>
%monad { &lt;type&gt; } { &lt;then&gt; } { &lt;return&gt; }
</programlisting>
	<indexterm>
	  <primary><literal>%monad</literal></primary>
	</indexterm>

	<para>(optional) The <literal>%monad</literal> directive takes three
        arguments: the type constructor of the monad, the
        <literal>then</literal> (or <literal>bind</literal>) operation, and the
        <literal>return</literal> (or <literal>unit</literal>) operation.  The type
        constructor can be any type with kind <literal>* -&gt; *</literal>.</para>

	<para>Monad declarations are described in more detail in <xref
        linkend="sec-monads"/>.</para>

      </sect2>

      <sect2 id="sec-lexer-decl">
	<title>Lexical Analyser</title>

<programlisting>
%lexer { &lt;lexer&gt; } { &lt;eof&gt; }
</programlisting>
	<indexterm>
	  <primary><literal>%lexer</literal></primary>
	</indexterm>

	<para>(optional) The <literal>%lexer</literal> directive takes two
        arguments: <literal>&lt;lexer&gt;</literal> is the name of the lexical
        analyser function, and <literal>&lt;eof&gt;</literal> is a token that
        is to be treated as the end of file.</para>

	<para>Lexer declarations are described in more detail in <xref
        linkend="sec-lexers"/>.</para>

      </sect2>

      <sect2 id="sec-prec-decls">
	<title>Precedence declarations</title>

<programlisting>
%left     &lt;name&gt; ...
%right    &lt;name&gt; ...
%nonassoc &lt;name&gt; ...
</programlisting>
	<indexterm>
	  <primary><literal>%left</literal> directive</primary>
	</indexterm>
	<indexterm>
	  <primary><literal>%right</literal> directive</primary>
	</indexterm>
	<indexterm>
	  <primary><literal>%nonassoc</literal> directive</primary>
	</indexterm>

	<para>These declarations are used to specify the precedences
	and associativity of tokens.  The precedence assigned by a
	<literal>%left</literal>, <literal>%right</literal> or
	<literal>%nonassoc</literal> declaration is defined to be
	higher than the precedence assigned by all declarations
	earlier in the file, and lower than the precedence assigned by
	all declarations later in the file.</para>

	<para>The associativity of a token relative to tokens in the
	same <literal>%left</literal>, <literal>%right</literal>, or
	<literal>%nonassoc</literal> declaration is to the left, to
	the right, or non-associative respectively.</para>

	<para>Precedence declarations are described in more detail in
	<xref linkend="sec-Precedences"/>.</para>
      </sect2>

      <sect2 id="sec-expect">
      	<title>Expect declarations</title>
<programlisting>
%expect &lt;number&gt;
</programlisting>
	<indexterm>
	  <primary><literal>%expect</literal> directive</primary>
	</indexterm>

	<para>(optional) More often than not the grammar you write
	will have conflicts. These conflicts generate warnings. But
	when you have checked the warnings and made sure that Happy
	handles them correctly these warnings are just annoying. The
	<literal>%expect</literal> directive gives a way of avoiding
	them. Declaring <literal>%expect
	<replaceable>n</replaceable></literal> is a way of telling
	Happy &ldquo;There are exactly <replaceable>n</replaceable>
	shift/reduce conflicts and zero reduce/reduce conflicts in
	this grammar. I promise I have checked them and they are
	resolved correctly&rdquo;.  When processing the grammar, Happy
	will check the actual number of conflicts against the
	<literal>%expect</literal> declaration if any, and if there is
	a discrepancy then an error will be reported.</para>

	<para>Happy's <literal>%expect</literal> directive works
	exactly like that of yacc.</para>
      </sect2>

      <sect2 id="sec-error-directive">
	<title>Error declaration</title>

<programlisting>
%error { &lt;identifier&gt; }
</programlisting>
	<indexterm>
	  <primary><literal>%error</literal></primary>
	</indexterm>

	<para>Specifies the function to be called in the event of a
	parse error.  The type of <literal>&lt;identifier&gt;</literal> varies
	depending on the presence of <literal>%lexer</literal> (see
	<xref linkend="sec-monad-summary" />) and <literal>%errorhandlertype</literal>
	(see the following).</para>
      </sect2>

      <sect2 id="sec-errorhandlertype-directive">
	<title>Additional error information</title>

<programlisting>
%errorhandlertype (explist | default)
</programlisting>

	<indexterm>
	  <primary><literal>%errorhandlertype</literal></primary>
	</indexterm>

	<para>(optional) The expected type of the user-supplied error handling can be
	applied with additional information. By default, no information is added, for
	compatibility with previous versions. However, if <literal>explist</literal>
	is provided with this directive, then the first application will be of
	type <literal>[String]</literal>, providing a description of possible tokens
	that would not have failed the parser in place of the token that has caused
	the error.
	</para>
      </sect2>

      <sect2 id="sec-attributes">
	<title>Attribute Type Declaration</title>
<programlisting>
%attributetype { &lt;valid Haskell type declaration&gt; }
</programlisting>
        <indexterm>
	  <primary><literal>%attributetype</literal> directive</primary>
	</indexterm>

	<para>(optional) This directive allows you to declare the type of the
	attributes record when defining an attribute grammar.  If this declaration
	is not given, Happy will choose a default.  This declaration may only
	appear once in a grammar.
	</para>
	<para>
	  Attribute grammars are explained in <xref linkend="sec-AttributeGrammar"/>.
	</para>
      </sect2>

      <sect2 id="sec-attribute">
	<title>Attribute declaration</title>
<programlisting>
%attribute &lt;Haskell identifier&gt; { &lt;valid Haskell type&gt; }
</programlisting>
        <indexterm>
         <primary><literal>%attribute</literal> directive</primary>
       </indexterm>

       <para>The presence of one or more of these directives declares that the
       grammar is an attribute grammar.  The first attribute listed becomes the
       default attribute.  Each <literal>%attribute</literal> directive generates a
       field in the attributes record with the given label and type.  If there
       is an <literal>%attributetype</literal> declaration in the grammar which
       introduces type variables, then the type of an attribute may mention any
       such type variables.
       </para>

       <para>
       Attribute grammars are explained in <xref linkend="sec-AttributeGrammar"/>.
       </para>
      </sect2>

    </sect1>

    <sect1 id="sec-grammar">
      <title>Grammar</title>

      <para>The grammar section comes after the directives, separated
      from them by a double-percent (<literal>%%</literal>) symbol.
      This section contains a number of
      <emphasis>productions</emphasis>, each of which defines a single
      non-terminal.  Each production has the following syntax:</para>
      <indexterm>
	<primary><literal>%%</literal></primary>
      </indexterm>

<programlisting>
&lt;non-terminal&gt; [ :: { &lt;type&gt; } ]
        :  &lt;id&gt; ... {[%] &lt;expression&gt; }
      [ |  &lt;id&gt; ... {[%] &lt;expression&gt; }
        ... ]
</programlisting>

      <para>The first line gives the non-terminal to be defined by the
      production and optionally its type (type signatures for
      productions are discussed in <xref
      linkend="sec-type-signatures"/>).</para>

      <para>Each production has at least one, and possibly many
      right-hand sides.  Each right-hand side consists of zero or more
      symbols (terminals or non-terminals) and a Haskell expression
      enclosed in braces.</para>

      <para>The expression represents the semantic value of the
      non-terminal, and may refer to the semantic values of the
    symbols in the right-hand side using the meta-variables
      <literal>&dollar;1 ... &dollar;n</literal>.  It is an error to
      refer to <literal>&dollar;i</literal> when <literal>i</literal>
      is larger than the number of symbols on the right hand side of
      the current rule. The symbol <literal>&dollar;</literal> may be
      inserted literally in the Haskell expression using the sequence
      <literal>\&dollar;</literal> (this isn't necessary inside a
      string or character literal).</para>

      <para>Additionally, the sequence <literal>&dollar;&gt;</literal>
      can be used to represent the value of the rightmost symbol.</para>

      <para>A semantic value of the form <literal>{% ... }</literal> is a
      <emphasis>monadic action</emphasis>, and is only valid when the grammar
      file contains a <literal>%monad</literal> directive (<xref
      linkend="sec-monad-decl"/>).  Monadic actions are discussed in
      <xref linkend="sec-monads"/>.</para>
      <indexterm>
	<primary>monadic</primary>
	<secondary>action</secondary>
      </indexterm>

      <para>Remember that all the expressions for a production must
      have the same type.</para>

      <sect2 id="sec-param-prods">
        <title>Parameterized Productions</title>
        <para>Starting from version 1.17.1, <application>Happy</application> supports
        <emphasis>parameterized productions</emphasis> which provide a
        convenient notation for capturing recurring patterns in context free
        grammars. This gives the benefits of something similar to parsing
        combinators in the context of <application>Happy</application>
        grammars.</para>
        <para>This functionality is best illustrated with an example:
<programlisting>
opt(p)          : p                   { Just $1 }
                |                     { Nothing }

rev_list1(p)    : p                   { [$1] }
                | rev_list1(p) p      { $2 : $1 }
</programlisting>
        The first production, <literal>opt</literal>, is used for optional
        components of a grammar.  It is just like <literal>p?</literal> in
        regular expressions or EBNF. The second production,
        <literal>rev_list1</literal>, is for parsing a list of 1 or more
        occurrences of <literal>p</literal>.  Parameterized productions are
        just like ordinary productions, except that they have parameters in
        parentheses after the production name. Multiple parameters should
        be separated by commas:
<programlisting>
fst(p,q)        : p q                 { $1 }
snd(p,q)        : p q                 { $2 }
both(p,q)       : p q                 { ($1,$2) }
</programlisting>
        </para>

        <para>To use a parameterized production, we have to pass values for the
        parameters, as if we are calling a function.  The parameters can be
        either terminals, non-terminals, or other instantiations of
        parameterized productions.  Here are some examples:
<programlisting>
list1(p)        : rev_list1(p)        { reverse $1 }
list(p)         : list1(p)            { $1 }
                |                     { [] }
</programlisting>
        The first production uses <literal>rev_list1</literal> to define
        a production that behaves like <literal>p+</literal>, returning
        a list of elements in the same order as they occurred in the input.
        The second one, <literal>list</literal> is like <literal>p*</literal>.
        </para>

        <para>Parameterized productions are implemented as a preprocessing
        pass in Happy:  each instantiation of a production turns into a
        separate non-terminal, but are careful to avoid generating the
        same rule multiple times, as this would lead to an ambiguous grammar.
        Consider, for example, the following parameterized rule:
<programlisting>
sep1(p,q)       : p list(snd(q,p))    { $1 : $2 }
</programlisting>
        The rules that would be generated for <literal>sep1(EXPR,SEP)</literal> are:
<programlisting>
sep1(EXPR,SEP)
  : EXPR list(snd(SEP,EXPR))                { $1 : $2 }

list(snd(SEP,EXPR))
  : list1(snd(SEP,EXPR))                    { $1 }
  |                                         { [] }

list1(snd(SEP,EXPR))
  : rev_list1(snd(SEP,EXPR))                { reverse $1 }

rev_list1(snd(SEP,EXPR))
  : snd(SEP,EXPR)                           { [$1] }
  | rev_list1(snd(SEP,EXPR)) snd(SEP,EXPR)  { $2 : $1 }

snd(SEP,EXPR)
  : SEP EXPR                                { $2 }
</programlisting>
        Note that this is just a normal grammar, with slightly strange names
        for the non-terminals.
        </para>

        <para>A drawback of the current implementation is that it does not
        support type signatures for the parameterized productions, that
        depend on the types of the parameters.  We plan to implement that
        in the future---the current workaround is to omit the type signatures
        for such rules.
        </para>
      </sect2>

      </sect1>

    <sect1 id="sec-module-trailer">
      <title>Module Trailer</title>
      <indexterm>
	<primary>module</primary>
	<secondary>trailer</secondary>
      </indexterm>

      <para>The module trailer is optional, comes right at the end of
      the grammar file, and takes the same form as the module
      header:</para>

<programlisting>
{
&lt;Haskell code&gt;
}
</programlisting>

      <para>This section is used for placing auxiliary definitions
      that need to be in the same module as the parser.  In small
      parsers, it often contains a hand-written lexical analyser too.
      There is no restriction on what can be placed in the module
      trailer, and any code in there is copied verbatim into the
      generated parser file.</para>

      </sect1>
    </chapter>

  <chapter id="sec-info-files">
    <title>Info Files</title>
    <indexterm>
      <primary>info files</primary>
    </indexterm>

    <para>
      Happy info files, generated using the <literal>-i</literal> flag,
      are your most important tool for debugging errors in your grammar.
      Although they can be quite verbose, the general concept behind
      them is quite simple.
    </para>

    <para>
      An info file contains the following information:
    </para>

    <orderedlist>
      <listitem>
        <para>A summary of all shift/reduce and reduce/reduce
          conflicts in the grammar.</para>
      </listitem>
      <listitem>
        <para>Under section <literal>Grammar</literal>, a summary of all the rules in the grammar.  These rules correspond directly to your input file, absent the actual Haskell code that is to be run for each rule.  A rule is written in the form <literal>&lt;non-terminal&gt; -> &lt;id&gt; ...</literal></para>
      </listitem>
      <listitem>
        <para>Under section <literal>Terminals</literal>, a summary of all the terminal tokens you may run against, as well as the Haskell pattern which matches against them.  This corresponds directly to the contents of your <literal>%token</literal> directive (<xref linkend="sec-tokens"/>).</para>
      </listitem>
      <listitem>
        <para>Under section <literal>Non-terminals</literal>, a summary of which rules apply to which productions.  This is generally redundant with the <literal>Grammar</literal> section.</para>
      </listitem>
      <listitem>
        <para>The primary section <literal>States</literal>, which describes the state-machine Happy built for your grammar, and all of the transitions for each state.</para>
      </listitem>
      <listitem>
        <para>Finally, some statistics <literal>Grammar Totals</literal> at the end of the file.</para>
      </listitem>
    </orderedlist>
    <para>In general, you will be most interested in the <literal>States</literal> section, as it will give you information, in particular, about any conflicts your grammar may have.</para>

    <sect1 id="sec-info-files-states">
      <title>States</title>
      <para>Although Happy does its best to insulate you from the
        vagaries of parser generation, it's important to know a little
        about how shift-reduce parsers work in order to be able to
        interpret the entries in the <literal>States</literal>
        section.</para>

      <para>In general, a shift-reduce parser operates by maintaining
        a parse stack, which tokens and productions are shifted onto or
        reduced off of.  The parser maintains a state machine, which
        accepts a token, performs some shift or reduce, and transitions
        to a new state for the next token.  Importantly, these states
        represent <emphasis>multiple</emphasis> possible productions,
        because in general the parser does not know what the actual
        production for the tokens it's parsing is going to be.
        There's no direct correspondence between the state-machine
        and the input grammar; this is something you have to
        reverse engineer.</para>

      <para>With this knowledge in mind, we can look at two example states
        from the example grammar from <xref linkend="sec-using" />:
      </para>

<programlisting>
State 5

        Exp1 -> Term .                                      (rule 5)
        Term -> Term . '*' Factor                           (rule 6)
        Term -> Term . '/' Factor                           (rule 7)

        in             reduce using rule 5
        '+'            reduce using rule 5
        '-'            reduce using rule 5
        '*'            shift, and enter state 11
        '/'            shift, and enter state 12
        ')'            reduce using rule 5
        %eof           reduce using rule 5

State 9

        Factor -> '(' . Exp ')'                             (rule 11)

        let            shift, and enter state 2
        int            shift, and enter state 7
        var            shift, and enter state 8
        '('            shift, and enter state 9

        Exp            goto state 10
        Exp1           goto state 4
        Term           goto state 5
        Factor         goto state 6
</programlisting>

      <para>For each state, the first set of lines describes the
        <emphasis>rules</emphasis> which correspond to this state.  A
        period <literal>.</literal> is inserted in the production to
        indicate where, if this is indeed the correct production, we
        would have parsed up to. In state 5, there are multiple rules,
        so we don't know if we are parsing an <literal>Exp1</literal>, a
        multiplication or a division (however, we do know there is a
        <literal>Term</literal> on the parse stack); in state 9, there
        is only one rule, so we know we are definitely parsing a
        <literal>Factor</literal>.</para>

      <para>The next set of lines specifies the action and state
        transition that should occur given a token.  For example, if in
        state 5 we process the <literal>'*'</literal> token, this token
        is shifted onto the parse stack and we transition to the state
        corresponding to the rule <literal>Term -> Term '*' .
          Factor</literal> (matching the token disambiguates which state
        we are in.)</para>

      <para>Finally, for states which shift on non-terminals,
        there will be a last set of lines saying what should be done
        after the non-terminal has been fully parsed; this information
        is effectively the stack for the parser.  When a reduce occurs,
        these goto entries are used to determine what the next
      state should be.</para>

      <!-- Probably could improve this section by walking through
      parsing -->

    </sect1>

    <sect1 id="sec-info-files-conflicts">
      <title>Interpreting conflicts</title>

      <para>When you have a conflict, you will see an entry like this
      in your info file:</para>

<programlisting>
State 432

        atype -> SIMPLEQUOTE '[' . comma_types0 ']'         (rule 318)
        sysdcon -> '[' . ']'                                (rule 613)

        '_'            shift, and enter state 60
        'as'           shift, and enter state 16

...

        ']'            shift, and enter state 381
                        (reduce using rule 328)

...
</programlisting>

      <para>On large, complex grammars, determining what the conflict is
        can be a bit of an art, since the state with the conflict may
        not have enough information to determine why a conflict is
        occurring.</para>

        <para>In some cases, the rules associated with the state with
          the conflict will immediately give you enough guidance to
          determine what the ambiguous syntax is.
          For example, in the miniature shift/reduce conflict
          described in <xref linkend="sec-conflict-tips" />,
          the conflict looks like this:</para>

<programlisting>
State 13

        exp -> exp . '+' exp0                               (rule 1)
        exp0 -> if exp then exp else exp .                  (rule 3)

        then           reduce using rule 3
        else           reduce using rule 3
        '+'            shift, and enter state 7
                        (reduce using rule 3)

        %eof           reduce using rule 3
</programlisting>

<para>Here, rule 3 makes it easy to imagine that we had been parsing a
  statement like <literal>if 1 then 2 else 3 + 4</literal>; the conflict
  arises from whether or not we should shift (thus parsing as
  <literal>if 1 then 2 else (3 + 4)</literal>) or reduce (thus parsing
  as <literal>(if 1 then 2 else 3) + 4</literal>).</para>

<para>Sometimes, there's not as much helpful context in the error message;
take this abridged example from GHC's parser:</para>

<programlisting>
State 49

        type -> btype .                                     (rule 281)
        type -> btype . '->' ctype                          (rule 284)

        '->'           shift, and enter state 472
                        (reduce using rule 281)
</programlisting>

<para>A pair of rules like this doesn't always result in a shift/reduce
  conflict: to reduce with rule 281 implies that, in some context when
  parsing the non-terminal <literal>type</literal>, it is possible for
  an <literal>'->'</literal> to occur immediately afterwards (indeed
  these source rules are factored such that there is no rule of the form
  <literal>... -> type '->' ...</literal>).</para>

<para>The best way this author knows how to sleuth this out is to
  look for instances of the token and check if any of the preceding
  non-terminals could terminate in a type:</para>

<programlisting>
        texp -> exp '->' texp                              (500)
        exp -> infixexp '::' sigtype                       (414)
        sigtype -> ctype                                   (260)
        ctype -> type                                      (274)
</programlisting>

<para>As it turns out, this shift/reduce conflict results from
  ambiguity for <emphasis>view patterns</emphasis>, as in
  the code sample <literal>case v of { x :: T -&gt; T ... }</literal>.</para>

    </sect1>

  </chapter>

  <chapter id="sec-tips">
    <title>Tips</title>

    <para>This section contains a lot of accumulated lore about using
    <application>Happy</application>.</para>

    <sect1 id="sec-performance-tips">
      <title>Performance Tips</title>

      <para>How to make your parser go faster:</para>

      <itemizedlist>

	<listitem>
	  <para> If you are using GHC
          <indexterm>
	    <primary>GHC</primary>
	  </indexterm>
	  , generate parsers using the
          <literal>-a -g -c</literal> options, and compile them using GHC with
          the <literal>-fglasgow-exts</literal> option.  This is worth a
          <emphasis>lot</emphasis>, in terms of compile-time,
          execution speed and binary size.<footnote><para>omitting the
          <literal>-a</literal> may generate slightly faster parsers,
          but they will be much bigger.</para></footnote></para>
	</listitem>

	<listitem>
	  <para> The lexical analyser is usually the most performance
          critical part of a parser, so it's worth spending some time
          optimising this.  Profiling tools are essential here.  In
          really dire circumstances, resort to some of the hacks that
          are used in the Glasgow Haskell Compiler's interface-file
          lexer.</para>
	</listitem>

	<listitem>
	  <para> Simplify the grammar as much as possible, as this
          reduces the number of states and reduction rules that need
          to be applied.</para>
	</listitem>

	<listitem>
	  <para> Use left recursion rather than right recursion
          <indexterm>
	    <primary>recursion, left vs. right</primary>
	  </indexterm>
          wherever possible.  While not strictly a performance issue,
          this affects the size of the parser stack, which is kept on
          the heap and thus needs to be garbage collected.</para>
	</listitem>

      </itemizedlist>


    </sect1>

    <sect1 id="sec-compilation-time">
      <title>Compilation-Time Tips</title>

      <para>We have found that compiling parsers generated by
      <application>Happy</application> can take a large amount of time/memory, so
      here's some tips on making things more sensible:</para>

      <itemizedlist>

	<listitem>
	  <para> Include as little code as possible in the module
          trailer.  This code is included verbatim in the generated
          parser, so if any of it can go in a separate module, do
          so.</para>
	</listitem>

	<listitem>
          <para> Give type signatures
	  <indexterm>
	    <primary>type</primary>
	    <secondary>signatures in grammar</secondary>
	  </indexterm>
	  for everything (see <xref
          linkend="sec-type-signatures"/>).  This is reported to improve
          things by about 50%.  If there is a type signature for every
          single non-terminal in the grammar, then <application>Happy</application>
          automatically generates type signatures for most functions
          in the parser.</para>
	</listitem>

	<listitem>
	  <para> Simplify the grammar as much as possible (applies to
          everything, this one).</para>
	</listitem>

	<listitem>
	  <para> Use a recent version of GHC.  Versions from 4.04
	  onwards have lower memory requirements for compiling
	  <application>Happy</application>-generated parsers.</para>
	</listitem>

	<listitem>
	  <para> Using <application>Happy</application>'s <literal>-g -a -c</literal>
	  options when generating parsers to be compiled with GHC will
	  help considerably.</para>
	</listitem>

      </itemizedlist>

    </sect1>

    <sect1 id="sec-finding-errors">
      <title>Finding Type Errors</title>

      <indexterm>
	<primary>type</primary>
	<secondary>errors, finding</secondary>
      </indexterm>

      <para>Finding type errors in grammar files is inherently
      difficult because the code for reductions is moved around before
      being placed in the parser.  We currently have no way of passing
      the original filename and line numbers to the Haskell compiler,
      so there is no alternative but to look at the parser and match
      the code to the grammar file.  An info file (generated by the
      <literal>-i</literal> option) can be helpful here.</para>

      <indexterm>
	<primary>type</primary>
	<secondary>signatures in grammar</secondary>
      </indexterm>

      <para>Type signatures sometimes help by pinning down the
      particular error to the place where the mistake is made, not
      half way down the file.  For each production in the grammar,
      there's a bit of code in the generated file that looks like
      this:</para>

<programlisting>
HappyAbsSyn&lt;n&gt; ( E )
</programlisting>
      <indexterm>
	<primary><literal>HappyAbsSyn</literal></primary>
      </indexterm>

      <para>where <literal>E</literal> is the Haskell expression from the
      grammar file (with <literal>&dollar;n</literal> replaced by
      <literal>happy_var_n</literal>).  If there is a type signature for this
      production, then <application>Happy</application> will have taken it into
      account when declaring the HappyAbsSyn datatype, and errors in
      <literal>E</literal> will be caught right here.  Of course, the error may
      be really caused by incorrect use of one of the
      <literal>happy_var_n</literal> variables.</para>

      <para>(this section will contain more info as we gain experience
      with creating grammar files.  Please send us any helpful tips
      you find.)</para>

    </sect1>

    <sect1 id="sec-conflict-tips">
      <title>Conflict Tips</title>
      <indexterm>
	<primary>conflicts</primary>
      </indexterm>

      <para>Conflicts arise from ambiguities in the grammar.  That is,
      some input sequences may possess more than one parse.
      Shift/reduce conflicts are benign in the sense that they are
      easily resolved (<application>Happy</application> automatically selects the
      shift action, as this is usually the intended one).
      Reduce/reduce conflicts are more serious.  A reduce/reduce
      conflict implies that a certain sequence of tokens on the input
      can represent more than one non-terminal, and the parser is
      uncertain as to which reduction rule to use.  It will select the
      reduction rule uppermost in the grammar file, so if you really
      must have a reduce/reduce conflict you can select which rule
      will be used by putting it first in your grammar file.</para>

      <para>It is usually possible to remove conflicts from the
      grammar, but sometimes this is at the expense of clarity and
      simplicity.  Here is a cut-down example from the grammar of
      Haskell (1.2):</para>

<programlisting>
exp     : exp op exp0
        | exp0

exp0    : if exp then exp else exp
        ...
        | atom

atom    : var
        | integer
        | '(' exp ')'
        ...
</programlisting>

      <para>This grammar has a shift/reduce conflict, due to the
      following ambiguity.  In an input such as</para>

<programlisting>
if 1 then 2 else 3 + 4
</programlisting>

      <para>the grammar doesn't specify whether the parse should be</para>

<programlisting>
if 1 then 2 else (3 + 4)
</programlisting>

      <para>or</para>

<programlisting>
(if 1 then 2 else 3) + 4
</programlisting>

      <para>and the ambiguity shows up as a shift/reduce conflict on
      reading the 'op' symbol.  In this case, the first parse is the
      intended one (the 'longest parse' rule), which corresponds to
      the shift action.  Removing this conflict relies on noticing
      that the expression on the left-hand side of an infix operator
      can't be an <literal>exp0</literal> (the grammar previously said
      otherwise, but since the conflict was resolved as shift, this
      parse was not allowed).  We can reformulate the
      <literal>exp</literal> rule as:</para>

<programlisting>
exp     : atom op exp
        | exp0
</programlisting>

      <para>and this removes the conflict, but at the expense of some
      stack space while parsing (we turned a left-recursion into a
      right-recursion).  There are alternatives using left-recursion,
      but they all involve adding extra states to the parser, so most
      programmers will prefer to keep the conflict in favour of a
      clearer and more efficient parser.</para>

      <sect2 id="sec-lalr">
	<title>LALR(1) parsers</title>

	<para>There are three basic ways to build a shift-reduce
        parser.  Full LR(1) (the `L' is the direction in which the
        input is scanned, the `R' is the way in which the parse is
        built, and the `1' is the number of tokens of lookahead)
        generates a parser with many states, and is therefore large
        and slow.  SLR(1) (simple LR(1)) is a cut-down version of
        LR(1) which generates parsers with roughly one-tenth as many
        states, but lacks the power to parse many grammars (it finds
        conflicts in grammars which have none under LR(1)). </para>

	<para>LALR(1) (look-ahead LR(1)), the method used by
        <application>Happy</application> and
        <application>yacc</application>, is a tradeoff between the two.
        An LALR(1) parser has the same number of states as an SLR(1)
        parser, but it uses a more complex method to calculate the
        lookahead tokens that are valid at each point, and resolves
        many of the conflicts that SLR(1) finds.  However, there may
        still be conflicts in an LALR(1) parser that wouldn't be there
        with full LR(1).</para>

      </sect2>
    </sect1>

    <sect1 id="sec-happy-ghci">
      <title>Using Happy with <application>GHCi</application></title>
      <indexterm><primary><application>GHCi</application></primary>
      </indexterm>

      <para><application>GHCi</application>'s compilation manager
      doesn't understand Happy grammars, but with some creative use of
      macros and makefiles we can give the impression that
      <application>GHCi</application> is invoking Happy
      automatically:</para>

      <itemizedlist>
	<listitem>
	  <para>Create a simple makefile, called
	  <filename>Makefile_happysrcs</filename>:</para>

<programlisting>HAPPY = happy
HAPPY_OPTS =

all: MyParser.hs

%.hs: %.y
	$(HAPPY) $(HAPPY_OPTS) $&lt; -o $@</programlisting>
	</listitem>

	<listitem>
	  <para>Create a macro in GHCi to replace the
          <literal>:reload</literal> command, like so (type this all
          on one line):</para>

<screen>:def myreload (\_ -> System.system "make -f Makefile_happysrcs"
   >>= \rr -> case rr of { System.ExitSuccess -> return ":reload" ;
                           _ -> return "" })</screen>
	</listitem>

	<listitem>
	  <para>Use <literal>:myreload</literal>
	  (<literal>:my</literal> will do) instead of
	  <literal>:reload</literal> (<literal>:r</literal>).</para>
	</listitem>
      </itemizedlist>
    </sect1>

    <sect1 id="sec-monad-alex">
      <title>Basic monadic Happy use with Alex</title>
      <indexterm>
        <primary><application>Alex</application></primary>
        <secondary>monad</secondary>
      </indexterm>

      <para>
        <application>Alex</application> lexers are often used by
        <application>Happy</application> parsers, for example in
        GHC. While many of these applications are quite sophisticated,
        it is still quite useful to combine the basic
        <application>Happy</application> <literal>%monad</literal>
        directive with the <application>Alex</application>
        <literal>monad</literal> wrapper. By using monads for both,
        the resulting parser and lexer can handle errors far more
        gracefully than by throwing an exception.
      </para>

      <para>
        The most straightforward way to use a monadic
        <application>Alex</application> lexer is to simply use the
        <literal>Alex</literal> monad as the
        <application>Happy</application> monad:
      </para>

      <example><title>Lexer.x</title>
<programlisting>{
module Lexer where
}

%wrapper "monad"

tokens :-
  ...

{
data Token = ... | EOF
  deriving (Eq, Show)

alexEOF = return EOF
}</programlisting></example>
      <example><title>Parser.y</title>
<programlisting>{
module Parser where

import Lexer
}

%name pFoo
%tokentype { Token }
%error { parseError }
%monad { Alex } { >>= } { return }
%lexer { lexer } { EOF }

%token
  ...

%%
  ...

parseError :: Token -> Alex a
parseError _ = do
  ((AlexPn _ line column), _, _, _) &lt;- alexGetInput
  alexError ("parse error at line " ++ (show line) ++ ", column " ++ (show column))

lexer :: (Token -> Alex a) -> Alex a
lexer = (alexMonadScan >>=)
}</programlisting></example>

      <para>
        We can then run the finished parser in the
        <literal>Alex</literal> monad using
        <literal>runAlex</literal>, which returns an
        <literal>Either</literal> value rather than throwing an
        exception in case of a parse or lexical error:
      </para>

<programlisting>
import qualified Lexer as Lexer
import qualified Parser as Parser

parseFoo :: String -> Either String Foo
parseFoo s = Lexer.runAlex s Parser.pFoo
</programlisting>

    </sect1>
  </chapter>
  <index/>
</book>