1
2
3 import time
4 import re
5 import glob
6 import os
7 import platform
8 import sys
9 import unicodedata
10 import urllib.parse
11 import codecs
12 import queue
13 from multiprocessing import Process, Event, Queue
14 from collections import Counter
15
# Directory that contains this script.
baseDir = os.path.dirname(__file__)

# Directory whose "*.txt" files are scanned by main().
sourceDir = os.path.join(baseDir, 'backup')
# Entries that main() compares against to exclude files from scanning.
blacklist = (
    'HilfeZurCreoleSyntax.txt',
)
22
class AnomalyFormatter:
    """
    Formats found anomalies and buffers the resulting text.
    The text buffer is returned and erased by getText().
    Also counts found anomalies.
    """

    def __init__(self, textEscaper, textDecorator, maxPartLength=70):
        """
        textEscaper has to provide escape(), escapeLimitLeft() and
        escapeLimitRight(). textDecorator has to provide decorateText() and
        the style attributes textBCyan, textWhite, textBYellow, textYellow
        and textUnderline.
        maxPartLength limits the length of the escaped line excerpt.
        """
        self._buffer = []
        self._escaper = textEscaper
        self._decorator = textDecorator
        self.maxPartLength = maxPartLength
        self.qoute = '"'
        self.ellipsis = '…'
        self.sol = '|'
        self.eol = '|'
        self.minAfterLength = 20
        self.counts = Counter()
        self._lastPath = ''
        # None never equals a real line number. Using 0 here would swallow
        # the line header of anomalies found in the first line (index 0).
        self._lastLineNr = None

    def _makeExcerpt(self, line, startColumn, endColumn):
        """
        Builds the escaped excerpt around line[startColumn:endColumn].

        Returns the tuple (sol, before, part, after, eol). The three text
        parts together are limited by maxPartLength. sol/eol are the
        ellipsis if the excerpt does not reach the start/end of the line.
        """
        escaper = self._escaper
        budget = self.maxPartLength

        # The anomaly itself gets the budget first.
        part, partCpLength = escaper.escapeLimitRight(
            line[startColumn:endColumn], budget)
        partComplete = ((endColumn - startColumn - partCpLength) == 0)
        budget = max(0, budget - len(part))

        # Reserve some room for text after the anomaly, then fill up with
        # the text before it.
        if partComplete:
            reserved = min(len(line) - endColumn, budget // 2,
                self.minAfterLength)
        else:
            reserved = 0
        beforeLength = min(startColumn, budget - reserved)
        before, beforeCpLength = escaper.escapeLimitLeft(
            line[:startColumn], beforeLength)
        budget = max(0, budget - len(before))

        # Whatever is left of the budget goes to the text after the anomaly.
        if partComplete:
            after, afterCpLength = escaper.escapeLimitRight(
                line[endColumn:], budget)
        else:
            after, afterCpLength = '', 0

        if startColumn - beforeCpLength > 0:
            sol = self.ellipsis
        else:
            sol = self.sol
        if (startColumn + partCpLength + afterCpLength) < len(line):
            eol = self.ellipsis
        else:
            eol = self.eol
        return sol, before, part, after, eol

    def out(self, path, lineNr, startColumn, endColumn, line, anomaly):
        """
        Buffers the report for one anomaly and updates the counters.

        lineNr and startColumn are zero-based; they are printed one-based.
        A path header is written whenever path differs from the previous
        call, a line header whenever the line differs.
        """
        b = self._buffer
        d = self._decorator
        q = self.qoute
        if self._lastPath != path:
            self._lastPath = path
            self._lastLineNr = None
            self.counts['pathCount'] += 1
            ePath = d.decorateText(self._escaper.escape(path), d.textBCyan)
            pageName = os.path.basename(path).replace(' - ', '/')
            if pageName[-4:] == '.txt':
                pageName = pageName[0:-4]
            url = 'https://larpwiki.de/' + urllib.parse.quote(pageName)
            eUrl = d.decorateText(url, d.textWhite)
            b.extend(('\n', ePath, ':\n'))
            b.extend((' ', eUrl, '\n'))
        if self._lastLineNr != lineNr:
            self.counts['lineCount'] += 1
            self._lastLineNr = lineNr
            eLineNr = d.decorateText(str(lineNr + 1), d.textBYellow)
            b.extend((' Line ', eLineNr, ':\n'))
        self.counts['anomalyCount'] += 1
        self.counts[anomaly] += 1
        eColumn = d.decorateText(str(startColumn + 1), d.textBYellow)

        sol, before, part, after, eol = self._makeExcerpt(line, startColumn,
            endColumn)
        before = d.decorateText(before, d.textYellow)
        part = d.decorateText(part, d.textBYellow, d.textUnderline)
        after = d.decorateText(after, d.textYellow)
        b.extend((' Column ', eColumn, ', anomaly ', q, anomaly, q, ':\n'))
        b.extend((' ', sol, q, before, part, after, q, eol, '\n'))

    def getText(self):
        """Returns the buffered text and empties the buffer."""
        text = ''.join(self._buffer)
        self._buffer = []
        return text

    def getCounts(self):
        """Returns the collected counters and starts counting anew."""
        counts = self.counts
        self.counts = Counter()
        return counts
122
class AnsiTextDecorator:
    """
    Wraps text into ANSI escape sequences to style terminal output.
    The class attributes are the parameter strings accepted by
    decorateText().
    """
    # Foreground colors
    textBlack = '30'
    textRed = '31'
    textGreen = '32'
    textYellow = '33'
    textBlue = '34'
    textMagenta = '35'
    textCyan = '36'
    textGrey = '37'
    # Foreground colors combined with parameter 1
    textBGrey = '30;1'
    textBRed = '31;1'
    textBGreen = '32;1'
    textBYellow = '33;1'
    textBBlue = '34;1'
    textBMagenta = '35;1'
    textBCyan = '36;1'
    textWhite = '37;1'
    # Font styles
    textBold = '1'
    textItalic = '3'
    textUnderline = '4'
    # Background colors
    backgroundBlack = '40'
    backgroundRed = '41'
    backgroundGreen = '42'
    backgroundYellow = '43'
    backgroundBlue = '44'
    backgroundMagenta = '45'
    backgroundCyan = '46'
    backgroundGrey = '47'

    def decorateText(self, text, *codes):
        """
        Returns text preceded by one escape sequence per code and followed
        by the reset sequence. Without codes text is returned unchanged.
        """
        if not codes:
            return text
        pieces = ['\x1B[' + code + 'm' for code in codes]
        pieces.append('{0}\x1B[0m'.format(text))
        return ''.join(pieces)
160
class DummyTextDecorator(AnsiTextDecorator):
    """
    Decorator that inherits the style constants of AnsiTextDecorator but
    never applies them.
    """

    def decorateText(self, text, *_unusedCodes):
        """Returns text unchanged, whatever codes are given."""
        return text
165
def makeTextDecorator(useAnsi=False):
    """
    Returns an AnsiTextDecorator if useAnsi is true, else a
    DummyTextDecorator.
    """
    decoratorClass = AnsiTextDecorator if useAnsi else DummyTextDecorator
    return decoratorClass()
170
class TextEscaper:
    """
    Escapes text the way repr() does and additionally escapes double
    quotes. Can limit the length of the escaped result.
    """

    def escape(self, text):
        """Returns the escaped form of text without surrounding quotes."""
        quoted = repr(text)
        return quoted[1:-1].replace('"', r'\"')

    def escapeLimitRight(self, text, maxLength):
        """
        Escapes the start of text so that the result is not longer than
        maxLength. Code points are dropped from the right until it fits.
        Returns (escaped text, number of code points used).
        """
        if maxLength <= 0:
            return '', 0
        used = min(len(text), maxLength)
        while True:
            escaped = self.escape(text[:used])
            if len(escaped) <= maxLength:
                return escaped, used
            used -= 1

    def escapeLimitLeft(self, text, maxLength):
        """
        Escapes the end of text so that the result is not longer than
        maxLength. Code points are dropped from the left until it fits.
        Returns (escaped text, number of code points used).
        """
        if maxLength <= 0:
            return '', 0
        tail = text[-maxLength:]
        dropped = 0
        while True:
            escaped = self.escape(tail[dropped:])
            if len(escaped) <= maxLength:
                return escaped, len(tail) - dropped
            dropped += 1
197
_detectSmilieRe = re.compile(r'''(?:^|(?<=\s))
[:;,8B][-~]?[)}\]|({[pPD][=\#]?
(?:\s|$)''', re.VERBOSE)
def detectSmilie(line, offset):
    """
    Tells whether a simple western LTR ASCII smilie like ";~P=" is located
    at position offset of line.

    Such a smilie consists of a symbol for the eyes, an optional symbol for
    the nose, a symbol for the mouth and an optional symbol for the beard.
    It has to be preceded by whitespace or the start of the line and has to
    be followed by whitespace or the end of the line.
    """
    found = _detectSmilieRe.match(line, offset)
    return found is not None
212
def checkForInvalidCodePoints(escaper, outputter, path, lineNr, line):
    """
    Reports every unexpected code point of line to outputter.

    Letters, numbers, punctuation, symbols and separators are accepted, as
    are a few explicitly tolerated code points. The replacement character is
    always reported. Marks are accepted only while the most recent non-mark
    code point was a letter.
    """
    tolerated = ('\t', '\xad', '\u200d', '\u200e')
    afterLetter = False
    for column, char in enumerate(line):
        category = unicodedata.category(char)
        mainCategory = category[0]

        if mainCategory == 'M':
            # Marks do not change afterLetter, so several marks may follow
            # the same letter.
            strayMark = not afterLetter
            isAnomaly = strayMark
        else:
            strayMark = False
            afterLetter = (mainCategory == 'L')
            if char == '�':
                isAnomaly = True
            else:
                isAnomaly = not (mainCategory in 'LNPSZ'
                    or char in tolerated)

        if not isAnomaly:
            continue
        name = unicodedata.name(char, 'unnamed')
        suffix = ' not preceded by a letter' if strayMark else ''
        msg = 'Unicode {0} ({1}, category {2}){3}'.format(
            escaper.escape(char), name, category, suffix)
        outputter.out(path, lineNr, column, column + 1, line, msg)
258
_checkForUseModListRe = re.compile(r'(\*|#(\*|#([*#])))[*#]*')
def checkForUseModList(outputter, path, lineNr, line, isDirective, isComment):
    """
    Reports a UseMod list marker at the start of line.

    Returns the pair (isDirective, isComment). Both are False if a list
    marker was found, otherwise the given values are passed through.
    """
    marker = _checkForUseModListRe.match(line)
    if marker is None:
        return isDirective, isComment
    outputter.out(path, lineNr, marker.start(), marker.end(), line,
        'UseMod list')
    return False, False
268
_checkForNonCommentAfterRedirectRe = re.compile(r'\s*(\S.*?)\s*$')
def detectNonCommentAfterRedirect(outputter, path, lineNr, line):
    """
    Reports the content of line, without surrounding whitespace, as text
    following a valid redirect.

    Returns True if something was reported. Lines without any
    non-whitespace content are not reported and yield False.
    """
    content = _checkForNonCommentAfterRedirectRe.match(line)
    if content is None:
        return False
    first, last = content.span(1)
    outputter.out(path, lineNr, first, last, line,
        'Non-empty non-comment line after valid redirect')
    return True
279
_detectRedirect = re.compile(r'#REDIRECT(\s*)(?P<name>.*)')
def detectRedirect(outputter, path, lineNr, line, firstDirectiveLine
    , validRedirectPresent):
    """
    Checks whether line is a redirect directive.

    A redirect is reported if firstDirectiveLine is false or if it names no
    target. Returns the pair (validRedirectPresent, isRedirect), where
    validRedirectPresent becomes True for an unreported redirect and is
    passed through otherwise.
    """
    redirect = _detectRedirect.match(line)
    if redirect is None:
        return validRedirectPresent, False
    if not firstDirectiveLine:
        problem = 'Redirect in non-first line'
    elif not redirect.group('name'):
        problem = 'Redirect without target'
    else:
        return True, True
    outputter.out(path, lineNr, 0, len(line), line, problem)
    return validRedirectPresent, True
297
def detectUseModIndent(outputter, path, lineNr, line):
    """
    Reports UseMod indentation, i.e. colons at the start of line.

    Lines not starting with a colon and lines starting with a smilie are
    ignored. Returns True if an anomaly was reported.
    """
    if line[0:1] != ':' or detectSmilie(line, 0):
        return False
    # Mark the whole run of leading colons. Stripping ';' here would strip
    # nothing, as the line starts with ':', and leave the span empty.
    end = len(line) - len(line.lstrip(':'))
    outputter.out(path, lineNr, 0, end, line, 'UseMod indentation')
    return True
304
def detectUseModDefinitionList(outputter, path, lineNr, line):
    """
    Reports a UseMod definition list, i.e. a semicolon at the start of line
    that is not the start of a smilie.

    Returns True if an anomaly was reported.
    """
    isDefinitionList = line.startswith(';') and not detectSmilie(line, 0)
    if isDefinitionList:
        outputter.out(path, lineNr, 0, 1, line, 'UseMod definition list')
    return isDefinitionList
310
_detectUseModTagsRe = re.compile(r'''<(?P<close>[/]?)
(?P<name>(b|i|nowiki|pre|toc|tt))
>''', re.IGNORECASE | re.VERBOSE)
def detectUseModTags(outputter, path, lineNr, line):
    """
    Reports every opening or closing b, i, nowiki, pre, toc and tt tag
    found in line, ignoring case. Always returns False.
    """
    for tag in _detectUseModTagsRe.finditer(line):
        kind = 'close' if tag.group('close') else 'open'
        name = tag.group('name').lower()
        first, last = tag.span()
        outputter.out(path, lineNr, first, last, line,
            'UseMod tag {0} {1}'.format(name, kind))
    return False
325
_checkBrTagsRe = re.compile(r'''
(?P<open><[<`]*)
(?P<name>br)
(?P<close>[>`]*>)
''', re.IGNORECASE | re.VERBOSE)
def checkBrTags(outputter, path, lineNr, line):
    """
    Reports the first faulty forced linebreak of line.

    UseMod forced linebreak: <br>
    MoinMoin forced linebreak: <<BR>>

    A MoinMoin linebreak is faulty if its name is not written in upper
    case. Returns True if an anomaly was reported.
    """
    for tag in _checkBrTagsRe.finditer(line):
        opening, name, closing = tag.group('open', 'name', 'close')
        if opening == '<' and closing == '>':
            msg = 'UseMod forced linebreak'
        elif (opening == '<<' and closing.startswith('>>')
                and name != 'BR'):
            msg = 'Invalid MoinMoin forced linebreak'
        else:
            continue
        outputter.out(path, lineNr, tag.start(), tag.end(), line, msg)
        return True
    return False
353
_checkHeadlinesRe = re.compile(r'''
(?P<spaceBeforOpen>\s*) # Illegal.
(?P<openTag>[=]+) # Headline open tag.
(?P<spaceAfterOpen>\s*) # Required.
(?P<nIndicator>[\#*]*)\s* # Numbering from old wiki.
(?P<text>.*?) # Required headline text (non-greedy).
(?P<spaceBeforClose>\s*) # Required.
(?P<closeTag>[=]*) # Has to be same as open tag.
(?P<spaceAfterClose>\s*) # Illegal trailing whitespace.
$''', re.VERBOSE)
def checkHeadlines(outputter, path, lineNr, line):
    """
    Checks a headline for leading/trailing whitespace, level, markup inside
    the text, missing text, missing whitespace next to the tags, UseMod
    numbering indicators and non-matching or missing close tags.

    Returns False if line is no headline, else True.
    """
    headline = _checkHeadlinesRe.match(line)
    if headline is None:
        return False

    def report(start, end, msg):
        outputter.out(path, lineNr, start, end, line, msg)

    group = headline.groupdict()
    openTag = group['openTag']
    textStart = headline.start('text')

    if group['spaceBeforOpen']:
        report(0, len(group['spaceBeforOpen']), 'Headline after whitespace')
    if len(openTag) > 5:
        levelStart, levelEnd = headline.span('openTag')
        report(levelStart, levelEnd, 'Headline of level > 5')

    # Nothing else is worth checking on a headline without text.
    if not group['text']:
        report(headline.end('openTag') - 1, len(line),
            'Headline contains no text')
        return True
    for markup in re.finditer(r"[`']{2,}", group['text']):
        report(textStart + markup.start(), textStart + markup.end(),
            'Headline contains markup')

    numbering = group['nIndicator']
    if not group['spaceAfterOpen']:
        start = headline.start('nIndicator') if numbering else textStart
        report(start, start + 1,
            'Headline without whitespace after open tag')
    if numbering:
        numberingStart, numberingEnd = headline.span('nIndicator')
        report(numberingStart, numberingEnd,
            'Headline with UseMod numbering indicator')

    closeTag = group['closeTag']
    if not closeTag:
        report(len(line) - 1, len(line), 'Headline without close tag')
        return True
    closeStart, closeEnd = headline.span('closeTag')
    if len(closeTag) != len(openTag):
        report(closeStart, closeEnd,
            'Headline with different length open and close tags')
    if not group['spaceBeforClose']:
        report(closeStart, closeStart + 1,
            'Headline without whitespace before close tag')
    if group['spaceAfterClose']:
        trailingStart, trailingEnd = headline.span('spaceAfterClose')
        report(trailingStart, trailingEnd, 'Headline ends with whitespace')
    return True
433
_checkLinksRe = re.compile(r'''
(?P<openBrackets>\[[\[`]*) # Valid links got 2 brackets
(?P<openQuote>"?) # Artifact from old wiki conversion
\s*
(?P<linkUrl>.*?) # Link URL (not greedy)
\s*
(?P<closeQuote>"?) # Artifact from old wiki conversion
(?P<closeBrackets>[\]`]*\]) # Valid links got 2 brackets
''', re.IGNORECASE | re.VERBOSE)
def checkLinks(outputter, path, lineNr, line):
    """
    Reports bracketed links that look like leftovers of the UseMod
    conversion: links whose target starts with a double quote and
    single-bracket links whose target contains a colon.
    Always returns False.
    """
    for link in _checkLinksRe.finditer(line):
        if link.group('openQuote'):
            msg = 'Fail-converted unnamed internal UseMod link'
        elif (len(link.group('openBrackets')) == 1
                and ':' in link.group('linkUrl')):
            msg = 'Fail-converted external UseMod link'
        else:
            continue
        outputter.out(path, lineNr, link.start(), link.end(), line, msg)
    return False
460
# Lookarounds keep the surrounding whitespace out of the match. A pattern
# consuming it would miss every second of several upload links separated by
# single whitespace characters.
_detectUseModUploadsRe = re.compile(
    r'(?:^|(?<=\s))(?P<link>upload:\S+)(?=\s|$)', re.I)
def detectUseModUploads(outputter, path, lineNr, line):
    """
    Reports every UseMod upload link ("upload:" followed by non-whitespace,
    ignoring case) that is delimited by whitespace or the start/end of
    line. Always returns False.
    """
    for match in _detectUseModUploadsRe.finditer(line):
        start, end = match.span('link')
        outputter.out(path, lineNr, start, end, line, 'UseMod upload link')
    return False
470
471
def detectMoinMoinComment(outputter, path, lineNr, line):
    """
    Tells whether line is a MoinMoin comment, i.e. starts with "##".
    Takes the same parameters as the check functions, but uses only line.
    """
    return line[:2] == '##'
474
def _splitIntoLines(text):
    """
    Splits text at "\\n" and removes one "\\r" directly in front of each
    "\\n". The text after the last "\\n" is kept unchanged as last line.
    """
    *terminated, last = text.split('\n')
    lines = [line[:-1] if line.endswith('\r') else line
        for line in terminated]
    lines.append(last)
    return lines

def makeCheckFile(checkFuns, cols, useAnsi):
    """
    Creates the function that checks a single file.

    checkFuns are called as checkFun(outputter, path, lineNr, line) for
    every line that is neither comment nor directive; a true result skips
    the remaining check functions for that line. cols is the width of the
    output in columns, useAnsi enables ANSI colors.

    The created function takes a file path and returns the formatted
    anomalies as text together with a tuple of (name, count) pairs.
    """
    escaper = TextEscaper()
    decorator = makeTextDecorator(useAnsi)
    maxPartLength = cols - 11
    outputter = AnomalyFormatter(escaper, decorator, maxPartLength)

    def checkFile(path):
        with open(path, 'rb') as file:
            textBytes = file.read()

        # Decode everything at once. This also rejects a multi-byte
        # sequence that is cut off at the end of the file.
        try:
            text = textBytes.decode('utf-8')
        except UnicodeDecodeError as error:
            # Everything in front of the invalid byte is valid and locates
            # the anomaly. No other check is done on such a file.
            validText = textBytes[:error.start].decode('utf-8')
            lineNr = validText.count('\n') # Zero-based like everywhere.
            lineStr = validText.rpartition('\n')[2]
            cpIndex = len(lineStr)
            msg = 'UTF-8 invalid byte while decoding line!'
            outputter.out(path, lineNr, cpIndex, cpIndex + 1, lineStr, msg)
            return outputter.getText(), tuple(outputter.getCounts().items())
        lines = _splitIntoLines(text)

        # NOTE(review): detectRedirect() only tests firstDirectiveLine for
        # truth and it is never 0 here, so "Redirect in non-first line" is
        # never reported. Confirm the intended rule before changing this.
        firstDirectiveLine = 1
        validRedirectPresent = False
        for lineNr, line in enumerate(lines):
            isComment = detectMoinMoinComment(outputter, path, lineNr, line)
            isDirective = not isComment and line.startswith('#')

            checkForInvalidCodePoints(escaper, outputter, path, lineNr
                , line)

            # May turn an apparent comment or directive into a list line.
            isDirective, isComment = checkForUseModList(outputter, path
                , lineNr, line, isDirective, isComment)

            # Comments get no further checks.
            if isComment:
                if firstDirectiveLine == lineNr:
                    firstDirectiveLine += 1
                continue

            # Only directives may follow a valid redirect.
            if validRedirectPresent and not isDirective:
                skipRemaining = detectNonCommentAfterRedirect(outputter, path
                    , lineNr, line)
                if skipRemaining:
                    continue

            validRedirectPresent, skipRemaining = detectRedirect(outputter
                , path, lineNr, line, firstDirectiveLine
                , validRedirectPresent)
            if skipRemaining:
                continue

            # Other directives are not checked.
            if isDirective:
                continue

            for checkFun in checkFuns:
                skipRemaining = checkFun(outputter, path, lineNr, line)
                if skipRemaining:
                    break

        return outputter.getText(), tuple(outputter.getCounts().items())

    return checkFile
555
def workerProc(termE:Event, jobs:Queue, results:Queue, workerFactory, *args):
    """
    Main function of a worker.

    Creates the work function by calling workerFactory(*args). Until termE
    is set, jobs are taken from jobs, waiting at most 0.02 seconds each
    time, and the pair (job, work function result) is put into results.
    A keyboard interrupt ends the worker silently.
    """
    try:
        process = workerFactory(*args)
        while True:
            if termE.is_set():
                break
            try:
                task = jobs.get(True, 0.02)
            except queue.Empty:
                pass
            else:
                results.put((task, process(task)), True)
    except KeyboardInterrupt:
        pass
568
def handleResults(results:Queue, counts:Counter):
    """
    Processes all results that are available without waiting.

    Each result is a pair of job and (text, counts of that job). The text
    is printed if it is not empty, the counts of the job are added to
    counts and counts['fileCount'] is increased by one.
    """
    while True:
        try:
            _, (text, jobCounts) = results.get(False)
        except queue.Empty:
            break
        counts['fileCount'] += 1
        if text:
            print(text, end='')
        for key, amount in jobCounts:
            counts[key] += amount
580
def main():
    """
    Scans all "*.txt" files of sourceDir with a pool of worker processes,
    prints the anomalies found and finally a summary.
    """
    import shutil

    checkFuns = (
        detectUseModIndent,
        detectUseModDefinitionList,
        detectUseModTags,
        checkBrTags,
        checkHeadlines,
        checkLinks,
        detectUseModUploads,
    )
    if sys.stdout.isatty() and (platform.system() != 'Windows'):
        # Asks the terminal itself instead of an external "tput" program.
        cols = shutil.get_terminal_size(fallback=(80, 24)).columns
        if cols <= 0:
            cols = 80
        useAnsi = True
    else:
        cols, useAnsi = 80, False

    # os.sched_getaffinity() is not available on every platform.
    try:
        workerCount = len(os.sched_getaffinity(0))
    except AttributeError:
        workerCount = os.cpu_count() or 1
    workerCount = max(1, workerCount)
    termE = Event()
    jobs = Queue(maxsize=2*workerCount)
    results = Queue(maxsize=2*workerCount)
    workerArgs = termE, jobs, results, makeCheckFile, checkFuns, cols, useAnsi
    workerPool = [Process(target=workerProc, args=workerArgs)
        for _ in range(0, workerCount)]
    for worker in workerPool:
        worker.start()
    counts = Counter()
    blistedCount = 0
    submittedCount = 0
    try:
        print('Scanning files...')
        paths = glob.iglob(os.path.join(sourceDir, "*.txt"))
        for path in paths:
            if not os.path.isfile(path):
                continue
            # The blacklist contains file names, not complete paths.
            if os.path.basename(path) in blacklist:
                blistedCount += 1
                continue
            while True:
                # Keep the result queue drained so the workers never block.
                handleResults(results, counts)
                try:
                    jobs.put(path, True, 0.02)
                    submittedCount += 1
                    break
                except queue.Full:
                    pass
        # Wait until every submitted job has reported back. jobs.empty()
        # is not reliable enough to tell that all jobs have been taken.
        while (counts['fileCount'] < submittedCount
                and any(worker.is_alive() for worker in workerPool)):
            handleResults(results, counts)
            time.sleep(0.02)
    except KeyboardInterrupt:
        print('')
        print('Processing interrupted by user!')
    finally:
        # Stop the workers on any exit, or they would run forever.
        termE.set()
        while any(worker.is_alive() for worker in workerPool):
            handleResults(results, counts)
            time.sleep(0.02)
        for worker in workerPool:
            worker.join()
        handleResults(results, counts)

    decorator = makeTextDecorator(useAnsi)
    fileCount, anomalyCount = counts['fileCount'], counts['anomalyCount']
    pathCount, lineCount = counts['pathCount'], counts['lineCount']
    # Only the counts per anomaly type remain for the listing below.
    del counts['fileCount']
    del counts['anomalyCount']
    del counts['pathCount']
    del counts['lineCount']
    eFileCount = decorator.decorateText(str(fileCount), decorator.textBYellow)
    eBlistedCount = decorator.decorateText(str(blistedCount),
        decorator.textBYellow)
    if anomalyCount != 0:
        eAnomalyCount = decorator.decorateText(str(anomalyCount),
            decorator.textBYellow)
        eLineCount = decorator.decorateText(str(lineCount),
            decorator.textBYellow)
        ePathCount = decorator.decorateText(str(pathCount),
            decorator.textBYellow)
        msg = ('Found {0} anomalies in {1} lines from {2} files'
            + ' ({3} scanned, {4} excluded):')
        print('')
        print(msg.format(eAnomalyCount, eLineCount, ePathCount, eFileCount
            , eBlistedCount))
        maxValueLen = len(str(max(counts.values())))
        for name, count in sorted(counts.items()):
            eCount = '{0:{1}}'.format(count, maxValueLen)
            eCount = decorator.decorateText(eCount, decorator.textBYellow)
            print(' {0} {1}'.format(eCount, name))
    else:
        msg = 'Found no anomalies in {0} files ({1} excluded).'
        print('')
        print(msg.format(eFileCount, eBlistedCount))
669
# Run the scan only when executed as a script, not when imported.
if __name__ == '__main__':
    main()