""" The pySUMO parsing module. It contains functions to parse and serialize 

kif-files. It also handles the Ontology's data structure and parses the mapping 

of SUMO terms to WordNet. 

 

 

This module contains: 

 

- AbstractSyntaxTree: The in-memory representation of an Ontology. 

- Ontology: Contains basic information about an Ontology. 

 

""" 

 

import re
from enum import Enum
from io import StringIO
from pickle import dumps

from .logger import actionlog

 

def _tokenize_docstring(chars, f):
    """ Tokenizes a line containing a docstring, reading further lines from
    f until every quote is closed. Returns the tokens and the number of
    extra lines read. """
    n = 0
    ret = []
    # Read on until the number of quotes is even, i.e. no docstring is
    # left open.
    while chars.count('"') % 2 != 0:
        n += 1
        line = f.readline()
        line = line.strip()
        chars = "".join([chars, line])
    chars = chars.split('"')
    # Alternate between code segments (cleaned and tokenized) and quoted
    # strings (kept verbatim, with the quotes restored).
    while len(chars) > 1:
        c = _tokenize(_cleanup(chars.pop(0)))
        ret.extend(c)
        ret.append("".join(['"', chars.pop(0), '"']))
    ret.extend(_tokenize(_cleanup(chars.pop(0))))
    return (ret, n)

 

 

def _tokenize(chars):
    """ Splits a line into tokens, treating each parenthesis as a token. """
    return chars.replace('(', ' ( ').replace(')', ' ) ').split()

 

def _cleanup(chars):
    """ Strips comments (everything after ';') and surrounding whitespace.
    Lines where a docstring opens before the ';' are returned unchanged. """
    if '"' in chars and ';' in chars and chars.find(';') > chars.find('"'):
        return chars
    chars = chars.split(';')
    chars = chars[0]
    chars = chars.strip()
    return chars
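
# Illustrative sketch of the two helpers above (doctest-style, kept in
# comments so that importing this module stays side-effect free):
#
#     >>> _cleanup('(subclass Human Animal) ; a comment')
#     '(subclass Human Animal)'
#     >>> _tokenize('(subclass Human Animal)')
#     ['(', 'subclass', 'Human', 'Animal', ')']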

 

def kifparse(infile, ontology, ast=None):
    """ Parse an ontology and return an AbstractSyntaxTree.

    Args:

    - infile: the file object to parse
    - ontology: the ontology to parse
    - ast: the AST of the ontologies on which this ontology depends

    Returns:

    - AbstractSyntaxTree

    """
    root = AbstractSyntaxTree(ontology)
    oldline = None
    linenumber = -1
    offset = 0
    for i, line in enumerate(infile):
        line = _cleanup(line)
        if line == "":
            continue
        if '"' in line:
            # Docstrings may span several lines; remember how many extra
            # lines were consumed so line numbers stay accurate.
            line, n = _tokenize_docstring(line, infile)
            offset += n
        else:
            line = _tokenize(line)
        if oldline is not None:
            # The previous line had unbalanced parentheses; the current
            # line continues that expression.
            line = oldline + line
            oldline = None
        if line[0] != '(':
            raise ParseError(" ".join(line), i + 1)
        if line.count('(') != line.count(')'):
            if linenumber == -1:
                linenumber = i + offset
            oldline = line
            continue
        if linenumber == -1:
            linenumber = i + offset
        while line != [] and line.count('(') == line.count(')'):
            node = AbstractSyntaxTree(ontology, line=linenumber)
            parsed = node.parse(line)
            line = line[parsed:]
            root.add_child(node)
        linenumber = -1
    if oldline is not None:
        raise ParseError(" ".join(oldline), linenumber)
    return root
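
# Minimal usage sketch (illustrative; kifparse only attaches the ontology
# argument to the created nodes, so None suffices for plain parsing):
#
#     >>> from io import StringIO
#     >>> tree = kifparse(StringIO('(instance Entity Class)'), None)
#     >>> print(tree.children[0])
#     ( instance Entity Class )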

 

def astmerge(trees):
    """ Merges two Abstract Syntax Trees.

    Args:

    - trees: a tuple of 2 AST objects

    Returns:

    - AbstractSyntaxTree

    """
    out = AbstractSyntaxTree(None)
    out.children.extend(trees[0].children)
    out.children.extend(trees[1].children)
    return out

 

def kifserialize(ast, ontology, out):
    """ Writes ontology to disk as KIF. Parses ast and writes out all nodes
    that belong to ontology.

    Args:

    - ast: the Abstract Syntax Tree
    - ontology: the specific ontology which is written to disk
    - out: the file object to which the ontology is written

    Raises:

    - OSError

    """
    for child in ast.children:
        if child.ontology != ontology:
            continue
        line = "".join([str(child), '\n'])
        out.write(line)
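
# Round-trip sketch continuing the kifparse example above (note that the
# serialized whitespace differs from the input, since __repr__ rebuilds
# the expression from tokens):
#
#     >>> buf = StringIO()
#     >>> kifserialize(tree, None, buf)
#     >>> buf.getvalue()
#     '( instance Entity Class )\n'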

 

# Matches a WordNet data line that ends with a SUMO mapping suffix
# ("&%Term" plus one of ']', '[', '@', '+', ':' or '='); lines that do
# not match are skipped by wparse below.
WORDNET_REGEX = re.compile(r'^(\d{8}) (\d{2}) ([nvasr]) ([0-9a-zA-Z]{2})(?: (\S+ ([0-9a-zA-Z])))+ (\d{3})(?: ((\S{1,2}) \d{8} [nvasr] [0-9a-zA-Z]{4}))*(?: \d{2} (\+ \d{2} [0-9a-zA-Z]{2} )+)? ?\| .+ &%.+[\][@+:=]$')

 

def wparse(datafiles):
    """ Parses the files containing the SUMO-WordNet mapping.

    Args:

    - datafiles: a list of (binary file object, Pos) pairs, one per
      WordNet data file

    Returns:

    - Dictionary mapping SUMO term names to sets of SUMOConceptWordNetItem
      objects

    """
    mapping = dict()
    total, processed = 0, 0
    for data, pos in datafiles:
        data.seek(0)
        sdata = StringIO(data.read().decode('utf8'))
        for line in sdata:
            total += 1
            # Syntactic validation: only lines matching the WordNet data
            # format with a SUMO mapping suffix are processed.
            if WORDNET_REGEX.match(line):
                items = _wtokenize(line.rstrip('\n'), pos)
                processed += 1
                for item in items:
                    try:
                        mapping[item.sumo_concept].add(item)
                    except KeyError:
                        mapping[item.sumo_concept] = {item}
    # TODO: Remove the fixed assertions/replace with dynamic assertions
    assert total == 117939, '%d lines were read, but %d lines should have been read' % (total, 117939)
    assert processed >= 117659 - 2000, 'processed %d, should have processed %d' % (processed, 117659)
    return mapping
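
# Hypothetical call sketch; the file name follows the standard WordNet
# layout and is an assumption, not part of this module. Note that the
# fixed assertions above expect the complete set of WordNet data files:
#
#     >>> with open('data.noun', 'rb') as noun:
#     ...     mapping = wparse([(noun, Pos.noun)])
#     >>> sorted(mapping)[:3]  # SUMO term names found in the file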

 

def _wtokenize(line, pos):
    """ Returns all the tokens of a WordNet data line. """
    items = line.split(' ')
    # byte offset in the current file
    synset_offset = int(items.pop(0))
    lex_filenum = int(items.pop(0))
    ss_type = SSType(items.pop(0))
    w_cnt = int(items.pop(0), 16)
    synset = list()
    for i in range(0, w_cnt):
        word = items.pop(0)
        if pos == Pos.adj and word[-1] == ')':
            # Adjectives may carry a syntactic marker, e.g. "word(p)".
            listy = word.split('(')
            syn_marker = listy.pop()[:-1]
            word = listy.pop()
        else:
            syn_marker = None
        lex_id = int(items.pop(0), 16)
        synset.append((word, syn_marker, lex_id))
    assert len(synset) == w_cnt, 'line %s has %d synsets, but should have %d' % (line, len(synset), w_cnt)
    p_cnt = int(items.pop(0))
    ptr_list = list()
    for i in range(0, p_cnt):
        pointer_symbol = items.pop(0)
        # Use a separate name so the pointer's offset does not clobber
        # the synset_offset of the line itself.
        p_synset_offset = int(items.pop(0))
        p_pos = Pos(items.pop(0))
        so_ta = items.pop(0)
        source = int(so_ta[:2], 16)
        target = int(so_ta[2:], 16)
        ptr_list.append((pointer_symbol, p_pos, p_synset_offset, source, target))
    assert len(ptr_list) == p_cnt, 'line "%s" has %d pointers, but %s only contains %d' % (line, p_cnt, ptr_list, len(ptr_list))
    frames = None
    if ss_type == SSType.verb:
        f_cnt = int(items.pop(0))
        frames = set()
        for i in range(0, f_cnt):
            assert items.pop(0) == '+', "Frames not separated by a '+'"
            f_num = int(items.pop(0))
            w_num = int(items.pop(0), 16)
            frames.add((f_num, w_num))
        assert len(frames) == f_cnt, 'line %s has %d frames, but should have %d' % (line, len(frames), f_cnt)
    assert items.pop(0) == '|', "Missing '|' separator"
    assert len(items) != 0, 'No gloss or SUMO-term in %s' % line
    string = ' '.join(items)
    assert string != ''
    items = string.split('&%')
    assert len(items) >= 2, '"%s": %s should contain at least 2 items, but contains %d' % (line, items, len(items))
    gloss = items.pop(0).rstrip()
    sumo_concepts = set()
    while len(items) > 0:
        # Each mapping entry ends with a one-character suffix ('=', '+',
        # '@', ':', '[' or ']') describing the mapping relation.
        name = items.pop(0)
        suffix = name[-1:]
        name = name[:-1]
        sumo_concepts.add(SUMOConceptWordNetItem(name, suffix, synset_offset,
                                                 lex_filenum, ss_type, synset,
                                                 ptr_list, frames, gloss))
    return sumo_concepts
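
# Schematic of the fields consumed above, in reading order (hex-encoded
# fields marked; see the WordNet database file format):
#
#     synset_offset lex_filenum ss_type w_cnt(hex) [word lex_id(hex)]*w_cnt
#     p_cnt [ptr_symbol synset_offset pos source/target(hex)]*p_cnt
#     [f_cnt [+ f_num w_num(hex)]*f_cnt]            (verb synsets only)
#     | gloss &%SUMOTerm<suffix>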

 

class SUMOConceptWordNetItem:
    """ The object returned from _wtokenize containing info on the
    SUMO-WordNet mapping. """

    def __init__(self, sumo_concept, suffix, synset_offset, lex_filenum,
                 ss_type, synset, ptr_list, frames, gloss):
        self.sumo_concept = sumo_concept
        self.suffix = suffix
        self.synset_offset = synset_offset
        self.lex_filenum = lex_filenum
        self.ss_type = ss_type
        self.synset = synset
        self.ptr_list = ptr_list
        self.frames = frames
        self.gloss = gloss

 

class Pos(Enum):
    noun = 'n'
    verb = 'v'
    adj = 'a'
    adv = 'r'

 

class SSType(Enum):
    noun = 'n'
    verb = 'v'
    adj = 'a'
    adv = 'r'
    adj_sat = 's'
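
# Both enums look members up by the single-character codes used in the
# WordNet data files, e.g.:
#
#     >>> Pos('n') is Pos.noun
#     True
#     >>> SSType('s') is SSType.adj_sat
#     True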

 

class AbstractSyntaxTree:
    """ The AbstractSyntaxTree is a node in the abstract syntax tree. The
    abstract syntax tree is defined by a root node and its children. The
    AbstractSyntaxTree is the in-memory representation of the loaded Ontologies
    for internal purposes only and should never be passed outside of the lib.

    Variables:

    - parent: The parent node.
    - children: A list of child nodes.
    - name: The name of the AbstractSyntaxTree object.
    - element_type: The type of the node element.
    - ontology: The Ontology object to which this node corresponds.
    - line: The line number at which this node was parsed.

    Methods:

    - add_child: Adds a child node.
    - remove_child: Removes a child node.

    """

    def __init__(self, ontology, parent=None, line=-1):
        # parent is accepted but not stored, matching the commented-out
        # assignment in add_child: a parent pointer would make the
        # pickle-based __eq__ and __hash__ serialize the whole tree
        # instead of just this subtree.
        self.parent = None
        self.children = []
        self.name = ''
        self.element_type = ''
        self.ontology = ontology
        self.line = line

 

    def __repr__(self):
        if len(self.children) == 0:
            return self.name
        out = " ".join(["(", self.name, ""])
        for child in self.children:
            out = "".join([out, str(child), " "])
        out = "".join([out, ")"])
        return out

 

    def __eq__(self, other):
        # Structural equality via pickling: two nodes compare equal iff
        # their serialized forms are identical.
        return dumps(self) == dumps(other)

    def __ne__(self, other):
        return not self.__eq__(other)

    def __hash__(self):
        return hash(dumps(self))

 

    def parse(self, tokens):
        """ Parses a token list into this node and its children. Returns
        the number of tokens consumed. """
        skip = 0
        for i, token in enumerate(tokens):
            if skip > 0:
                # Token was already consumed by a child node.
                skip -= 1
                continue
            if token == '(':
                if self.name == '':
                    # The first token after the opening parenthesis names
                    # this node.
                    self.name = tokens[i + 1]
                    skip += 1
                else:
                    # A nested expression: parse it recursively as a child.
                    child = AbstractSyntaxTree(self.ontology, parent=self, line=self.line)
                    skip += child.parse(tokens[i:])
                    skip -= 1
                    self.add_child(child)
            elif token == ')':
                return i + 1
            else:
                child = AbstractSyntaxTree(self.ontology, parent=self, line=self.line)
                child.name = token
                self.add_child(child)
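
    # Sketch of how parse consumes a token list (illustrative):
    #
    #     >>> node = AbstractSyntaxTree(None)
    #     >>> node.parse(['(', 'subclass', 'Human', 'Animal', ')'])
    #     5
    #     >>> print(node)
    #     ( subclass Human Animal )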

 

    def add_child(self, entry):
        """ Adds entry as a child to self. """
        # entry.parent = self
        self.children.append(entry)

    def remove_child(self, entry):
        """ Removes entry from the node's children. """
        self.children.remove(entry)

 

class ParseError(Exception):
    def __init__(self, line, linenumber):
        self.line = line
        self.linenumber = linenumber

    def __str__(self):
        return "".join(["Parse error in line ", str(self.linenumber), "\n", self.line])