-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathBibTeX.hs
209 lines (166 loc) · 4.7 KB
/
BibTeX.hs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
{- |
The parsers in this module also skip trailing spaces.
-}
module BibTeX (
file,
comment,
entry,
assignment,
value,
texSequence,
texBlock,
identifier,
bibIdentifier,
-- utility functions
skippingSpace,
skippingLeadingSpace,
splitCommaSepList,
splitAuthorList,
splitSepList,
) where
import qualified Text.BibTeX.Entry as Entry
import qualified Text.ParserCombinators.Parsec.Token as T
import qualified Text.ParserCombinators.Parsec.Language as L
import qualified Text.ParserCombinators.Parsec as Parsec
import Text.ParserCombinators.Parsec
          (CharParser, Parser,
           (<|>), alphaNum, digit, letter, char, noneOf, oneOf,
           between, many, many1, sepEndBy, )
import Control.Monad (liftM, liftM2, liftM3, )
import Data.Char (toLower, )
import Data.List.HT (chop, )
import Data.Maybe
{- |
Token parser shared by the lexeme-level parsers of this module.
It is built from Parsec's empty language definition,
with @%@ starting a line comment
and identifiers made up of alphanumeric characters only.
-}
lexer :: T.TokenParser st
lexer = T.makeTokenParser bibTeXDef
   where
      bibTeXDef =
         L.emptyDef
            { L.commentLine = "%"
            , L.identStart = alphaNum
            , L.identLetter = alphaNum
            }
-- | An identifier token as defined by 'lexer' (alphanumeric characters).
identifier :: CharParser st String
identifier = T.identifier lexer

-- | The comma token of 'lexer'.
comma :: CharParser st String
comma = T.comma lexer

-- | The symbol @=@, parsed as a token of 'lexer'.
equals :: CharParser st String
equals = T.symbol lexer "="
-- | Run the given parser between the brace tokens of 'lexer'.
braces :: CharParser st a -> CharParser st a
braces = T.braces lexer

-- | Turn the given parser into a lexeme parser of 'lexer'.
lexeme :: CharParser st a -> CharParser st a
lexeme = T.lexeme lexer
{- |
Parse a sequence of BibTeX entries.

All text in front of the first entry and between two entries
is consumed by 'comment' and thrown away; this includes white space.
Entries for which 'entry' returns 'Nothing' are dropped from the result.
If no entry is found, the result is the empty list.

Note that 'entry' and the other parsers of this module
do not accept leading spaces on their own, cf. 'skippingSpace'.
If you want to parse a file that consists entirely of BibTeX data
you may call @skippingLeadingSpace file@.

The parser does not demand the end of the input,
so it can be combined with other parsers.
-}
file :: Parser [Entry.T]
file =
   do _ <- comment
      parsedEntries <- sepEndBy entry comment
      return (catMaybes parsedEntries)
{- |
Consume every character up to, but not including, the next at-sign
and return the consumed text.
The result is empty if the input starts with an at-sign or is exhausted.
-}
comment :: Parser String
comment = many (noneOf ['@'])
{- |
Parse a BibTeX entry like

> @article{author2010title,
>   author = {Firstname Surname},
>   title = {Title},
>   year = 2010,
>   month = jul,
> }

.

The result is 'Nothing' if the entry type is @comment@.
The entry type is compared case-insensitively,
so @\@Comment@ and @\@COMMENT@ are recognised as well.
In that case only the at-sign and the entry type are consumed;
the comment body is left in the input
(within 'file' it is skipped by the following 'comment' parser).

For every other entry type the result is @Just@ the parsed entry,
which stores the entry type exactly as it was written.
-}
entry :: Parser (Maybe Entry.T) -- could be only a comment
entry =
   do entryType <- char '@' >> identifier
      -- BibTeX does not distinguish the case of entry types,
      -- hence normalise before comparing.
      if map toLower entryType == "comment"
         then return Nothing
         else
            fmap Just $ braces $
               liftM2 (Entry.Cons entryType)
                  -- 'Parsec.try' undoes consumed input if the key is malformed
                  (Parsec.try bibIdentifier)
                  (comma >> sepEndBy assignment comma)
{- |
Parse a single field assignment like

> author = {Firstname Surname}

.
The result is the pair of field name and field value.
-}
assignment :: Parser (String, String)
assignment =
   do fieldName <- bibIdentifier
      _ <- equals
      fieldValue <- value
      return (fieldName, fieldValue)
{- |
Parse the right-hand side of an assignment.
The following forms are tried in this order:

> jul

(a word consisting of letters),

> 2010

(a number consisting of digits),

> {Firstname Surname}

(text enclosed in curly braces) and

> "Firstname Surname"

(text enclosed in double quotes).
For the enclosed forms the delimiters are not part of the result.
-}
value :: Parser String
value =
   bareWord <|> number <|> braced <|> quoted
   where
      -- for fields like: month = jul
      bareWord = lexeme (many1 letter)
      -- for fields like: year = 2010
      number = lexeme (many1 digit)
      braced = braces (texSequence '}')
      quoted = lexeme (between (char '"') (char '"') (texSequence '"'))
{- |
Parse zero or more 'texBlock's and concatenate their texts.
Parsing stops in front of the given closing character,
which is neither consumed nor included in the result.
-}
texSequence :: Char -> Parser String
texSequence closeChar =
   concat <$> many (texBlock closeChar)
{- |
Parse one unit of TeX text.
The alternatives are tried in this order:

* a block enclosed in curly braces like @{\\\"{a}bc}@,
  returned including its braces,

* a backslash followed by a letter
  or by one of the special characters listed below,
  returned as a two-character string,

* any single character other than the closing character, like @a@.
-}
texBlock :: Char -> Parser String
texBlock closeChar =
   nested <|> escaped <|> plain
   where
      nested =
         do open <- char '{'
            body <- texSequence '}'
            close <- char '}'
            return (open : body ++ [close])
      escaped =
         do backslash <- char '\\'
            following <- oneOf "_{}$|'`^&%\".,~# " <|> letter
            return [backslash, following]
      plain =
         do c <- noneOf [closeChar]
            return [c]
{- |
Parse a BibTeX name like the entry key @author2010title@.
The first character is alphanumeric or an underscore;
the following characters are alphanumeric or one of @&;:-_.?+/@.
-}
bibIdentifier :: Parser String
bibIdentifier =
   lexeme $
      -- Deliberately permissive about the first character,
      -- in order to cope with keys found in BibTeX files in the wild.
      do firstChar <- alphaNum <|> char '_'
         restChars <- many (alphaNum <|> oneOf "&;:-_.?+/")
         return (firstChar : restChars)
{- |
Extend a parser such that all spaces following its match are skipped.
The result of the extended parser is the result of the original one.

Skipping leading spaces instead might seem more comfortable,
but parsers written that way are hard to combine:
If two alternative parsers both expect leading spaces,
then the parser combinator cannot tell
which of the alternatives to choose.
See also: 'lexeme'.
-}
skippingSpace :: Parser a -> Parser a
skippingSpace p =
   p <* Parsec.skipMany Parsec.space
{- |
Extend a parser such that all spaces in front of its match are skipped.
-}
skippingLeadingSpace :: Parser a -> Parser a
skippingLeadingSpace p =
   do Parsec.skipMany Parsec.space
      p
-- * Convert contents of BibTeX fields into lists
{- |
Split a string at every comma
and strip the leading space characters from each piece.
This is 'splitSepList' with a comma as separator.
-}
splitCommaSepList :: String -> [String]
splitCommaSepList str = splitSepList ',' str
{- |
Split a string containing a list of authors in BibTeX notation,
that is, names separated by the word @and@.

The string is broken into words first and each name is re-joined
with single spaces, so runs of white space within a name are collapsed.
Only the exact lowercase word @and@ is treated as separator.
-}
splitAuthorList :: String -> [String]
splitAuthorList str =
   [unwords nameWords | nameWords <- chop isSeparator (words str)]
   where
      isSeparator w = "and" == w
{- |
Split a string at every occurrence of the separator character
and strip the leading space characters from each piece.
Only the character @' '@ is stripped; trailing spaces are kept.
-}
splitSepList :: Char -> String -> [String]
splitSepList sep str =
   [dropWhile (== ' ') piece | piece <- chop (== sep) str]