#!/usr/bin/env python3
#
# Tilmann Haak <spam@thaak.de>
# 2011-12-04
# 2011-12-16 Allan Wegan <allanwegan@allanwegan.de>:
#   - Retries on network (or wiki) error.
# 2012-12-25 Allan Wegan <allanwegan@allanwegan.de>:
#   - Updated to Python 3.
#   - Now retrieves pages using less restrictive interface.
#   - Replaces "/" in file names with " - " instead of "%2f".
# 2013 Allan Wegan <allanwegan@allanwegan.de>:
#   - Outputs each page URL that is retrieved.
# 2017-12 Allan Wegan <allanwegan@allanwegan.de>:
#   - Replaced last use of XMLRPC (page index) with an HTML parser.
#   - Implemented attachment retrieval.
#   - Skips unretrievable pages (cgi-bin/wiki.pl) or attachments (handles 404).
#
# Downloads all pages and attachments from larpwiki.de to current folder.
# Writes each page into a text file, e.g. "LarpWiki.txt".
# Writes each page's attachments into files, e.g. "LarpWiki.txt - image.jpg".
#
# Expects the path to the LARP-Wiki certificate chain in caChainFile. Set it to
# None to use the standard system CAs instead.
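#
# Example invocation (the script file name is an assumption; use the actual
# name of this file):
#   cd /path/to/backup/folder
#   python3 larpwiki-backup.py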

import sys
import time
import re
import os.path
import ssl
from glob import iglob
import urllib.parse
import urllib.error
import urllib.request
from html.parser import HTMLParser

baseDir = os.path.dirname(__file__)

baseUrl = "https://www.larpwiki.de/"
caChainFile = os.path.join(baseDir, "www.larpwiki.de.pem")
indexPage = "Admin/SiteIndex"

netRetryM = 0x7FFFFFFF # Maximum number of retries per request before giving up.
netRetryD = 1.0 # Seconds to wait initially before retrying a failed request.
netRetryDFun = lambda oldDelay: oldDelay * 2.0
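# With these defaults the delay doubles after every failed attempt:
# 1s, 2s, 4s, 8s, ... (netRetryM is effectively unlimited retries).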

class UrlNotFound(Exception): pass

# Calls fun() and returns its result, retrying at most retriesMax times on
# failure; HTTP 404 is converted to UrlNotFound and raised immediately:
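# Illustrative call (hypothetical values):
#   retryOnError(lambda: urlopen(url), 3, 1.0, lambda d: d * 2.0)
# tries urlopen up to 4 times, sleeping 1s, 2s, and 4s between attempts.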
def retryOnError(fun, retriesMax, retryDelay, retryDelayFun):
  while True:
    # noinspection PyBroadException
    try:
      return fun()
    except KeyboardInterrupt:
      raise
    except Exception as ex:
      if isinstance(ex, urllib.error.HTTPError) and ex.code == 404:
        raise UrlNotFound(ex)
      if retriesMax < 1: raise
      print("Error:", sys.exc_info())
      print("Waiting %f seconds before retrying (Retries left: %i)..." % (
        retryDelay, retriesMax
      ))
      time.sleep(retryDelay)
      retriesMax -= 1
      retryDelay = retryDelayFun(retryDelay)
      continue

# Stores text or binary content in a file:
def writeFile(path, content):
  openMode = 'w' if isinstance(content, str) else 'wb'
  with open(path, openMode) as fd:
    fd.write(content)

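# E.g. createSiteUrl("Admin/SiteIndex", getRaw=True) yields
# "https://www.larpwiki.de/Admin/SiteIndex?action=raw".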
def createSiteUrl(pageUrl, getRaw):
  tpl = "{baseUrl}{pageUrl}{rawArgs}"
  rawArgs = "?action=raw" if getRaw else ""
  return tpl.format(baseUrl=baseUrl, pageUrl=pageUrl, rawArgs=rawArgs)

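# E.g. createAttachmentUrl("LarpWiki", "image.jpg") yields
# "https://www.larpwiki.de/LarpWiki?action=AttachFile&do=get&target=image.jpg".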
def createAttachmentUrl(pageUrl, name):
  tpl = "{baseUrl}{pageUrl}?action=AttachFile&do=get&target={name}"
  name = urllib.parse.quote(name, safe='')
  return tpl.format(baseUrl=baseUrl, pageUrl=pageUrl, name=name)

def getSslContext():
  """
  Python may or may not fail to find system certificates based on version,
  distribution and user-side configuration.
  This should be able to work around that on almost any Unix or Linux.
  """
  # Fallback cert files or cert file folders:
  certfiles = (
    # Source: <https://golang.org/src/crypto/x509/root_linux.go>:
    "/etc/ssl/certs/ca-certificates.crt", # Debian/Ubuntu/Gentoo etc.
    "/etc/pki/tls/certs/ca-bundle.crt", # Fedora/RHEL 6
    "/etc/ssl/ca-bundle.pem", # OpenSUSE
    "/etc/pki/tls/cacert.pem", # OpenELEC
    "/etc/pki/ca-trust/extracted/pem/tls-ca-bundle.pem", # CentOS/RHEL 7
    # Source: <https://golang.org/src/crypto/x509/root_unix.go>:
    "/etc/ssl/certs", # SLES10/SLES11, https://golang.org/issue/12139
    "/system/etc/security/cacerts", # Android
    "/usr/local/share/certs", # FreeBSD
    "/etc/pki/tls/certs", # Fedora/RHEL
    "/etc/openssl/certs", # NetBSD
  )
  sslCtx = ssl.create_default_context()
  for capath in certfiles:
    if sslCtx.cert_store_stats()['x509_ca'] != 0:
      break
    if os.path.isdir(capath):
      paths = iglob(os.path.join(capath,"*"))
    elif os.path.isfile(capath):
      paths = (capath,)
    else:
      paths = ()
    for path in paths:
      # noinspection PyBroadException
      try:
        sslCtx.load_verify_locations(cafile=path)
      except Exception:
        pass # Ignore files not containing certificates.
    #print(sslCtx.cert_store_stats(), capath)
  if sslCtx.cert_store_stats()['x509_ca'] == 0:
    raise Exception("System CA certificates not found!")
  return sslCtx

def getUrlContent(sslContext, url):
  def f():
    # Read inside the retried function so that connection drops during
    # read() are retried too, and the response gets closed properly:
    with urllib.request.urlopen(url, context=sslContext) as response:
      return response.read()
  return retryOnError(f, netRetryM, netRetryD, netRetryDFun)

def getPageText(sslContext, url):
  return getUrlContent(sslContext, url).decode("utf-8", "replace")

# noinspection PyAbstractClass
class IndexPageParser(HTMLParser):
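  """
  Extracts page URLs from the site index HTML. A small matcher stack drives
  the parsing; it assumes the index page looks roughly like:

    <h2 id="Wiki-Seiten">...</h2>
    ...
    <ul>
      <li><a href="/PageName">PageName</a></li>
      ...
    </ul>

  Everything before the "Wiki-Seiten" heading and after the closing </ul> is
  ignored; each link href (minus the leading "/") is collected into .pages.
  """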

  def __init__(self):
    super().__init__()
    self.pages = []
    self._tags = [
      # (isOpening, tag, requiredAttrs, onMatch)
      ((True, 'h2', (('id', 'Wiki-Seiten'),), self._removeFirstFromStack),),
      ((False, 'h2', (), self._removeFirstFromStack),),
      ((True, 'ul', (), self._removeFirstFromStack),),
      (
        (False, 'ul', (), self._removeFirstFromStack),
        (True, 'a', (), self._parsePageLinkOpen),
      ),
    ]

  # noinspection PyUnusedLocal
  def _removeFirstFromStack(self, isOpen, tag, attrs):
    del self._tags[0]

  # noinspection PyUnusedLocal
  def _parsePageLinkOpen(self, isOpen, tag, attrs):
    url = attrs['href']
    if url.startswith('/'):
      url = url[1:]
    self.pages.append(url)

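  # Matches the tag against the first matcher group on the stack; the first
  # matching entry's onMatch callback decides how the state advances.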
  def handleTag(self, isOpen, tag, attrs):
    attrs = dict(attrs)
    if len(self._tags) == 0:
      return
    for cOpen, cTag, cAttrs, onMatch in self._tags[0]:
      if (cOpen is isOpen and tag == cTag
      and all(attrs.get(k,None) == v for k,v in cAttrs)):
        onMatch(isOpen, tag, attrs)
        return

  def handle_starttag(self, tag, attrs):
    self.handleTag(True, tag, attrs)

  def handle_endtag(self, tag):
    self.handleTag(False, tag, {})

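# Finds MoinMoin attachment markup in page text; e.g. "[[attachment:image.jpg]]"
# and "{{attachment:image.jpg|Alt text}}" both yield "image.jpg":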
def extractAttachmentNames(text):
  attachmentRe = re.compile(r'''(?<=[[{]{2}attachment:)
    [^]|}]+
    (?=[]}]{2}|[|])''', re.VERBOSE | re.IGNORECASE)
  return set(attachmentRe.findall(text))

def main():
  sslContext = getSslContext()
  if caChainFile is not None:
    sslContext.load_verify_locations(cafile=caChainFile)

  url = createSiteUrl(indexPage, False)
  print("Getting page index from {0}...".format(url))
  text = getPageText(sslContext, url)
  parser = IndexPageParser()
  parser.feed(text)
  pages = parser.pages

  # Store pages and attachments in current dir:
  slashRegExp = re.compile("/")
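  # "/" is not allowed in file names, so e.g. page "Foo/Bar" is stored as
  # "Foo - Bar.txt" and its attachment "pic.jpg" as "Foo - Bar.txt - pic.jpg".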
  pLen = len(pages)
  for pCount, pUrl in enumerate(sorted(pages), start=1):
    pName = urllib.parse.unquote(pUrl)
    pFileName = slashRegExp.sub(' - ', pName) + '.txt'
    if os.path.isfile(pFileName):
      msg = '{0}/{1} page "{2}" already exists.'
      print(msg.format(pCount, pLen, pFileName))
    else:
      pRawUrl = createSiteUrl(pUrl, True)
      print('{0}/{1} fetching page "{2}".'.format(pCount, pLen, pFileName))
      print('  <{0}>'.format(pRawUrl))
      try:
        text = getPageText(sslContext, pRawUrl)
        attachments = extractAttachmentNames(text)
        aLen = len(attachments)
        for aCount, aName in enumerate(sorted(attachments), start=1):
          tpl = "{0}.txt - {1}"
          aFileName = slashRegExp.sub(' - ', tpl.format(pName, aName))
          if os.path.isfile(aFileName):
            msg = '{0}/{1} attachment "{2}" already exists.'
            print(msg.format(aCount, aLen, aName))
          elif "/" in aName:
            msg = '{0}/{1} attachment "{2}" is a reference.'
            print(msg.format(aCount, aLen, aName))
          else:
            aUrl = createAttachmentUrl(pUrl, aName)
            msg = '{0}/{1} fetching attachment "{2}".'
            print(msg.format(aCount, aLen, aName))
            print('  <{0}>'.format(aUrl))
            try:
              aBytes = getUrlContent(sslContext, aUrl)
              writeFile(aFileName, aBytes)
            except UrlNotFound:
              print('  404 - Not found!')
        writeFile(pFileName, text)
      except UrlNotFound:
        print('  404 - Not found!')
      time.sleep(5.0) # don't hammer on the wiki!

if __name__ == "__main__":
  main()
