# משתמש:I'm the cookie monster/PBYtoWiki.py

"""
This library converts HTML files from the Ben Yehuda Project (https://benyehuda.org/)
into wiki code that can be used on any MediaWiki website, but especially on the Hebrew Wikisource.
See https://he.wikipedia.org/wiki/%D7%A2%D7%96%D7%A8%D7%94:%D7%AA%D7%97%D7%91%D7%99%D7%A8_%D7%95%D7%99%D7%A7%D7%99 for more information about wiki code.
"""
import re
import html

# Path of the downloaded HTML page that is going to be converted.
file_name = "C:\\Users\\efrmo\\Downloads\\" + "31020.html"
# Name of the file that receives the generated wiki code.
OUTPUT_NAME = "PBYtoWiki_-_Output.txt"

# Load the whole page into memory; every later step rewrites ``text``.
with open(file_name, mode="r", encoding="utf-8") as file:
    text = file.read()

# footnotes
def replace_foonote(match_obj: re.Match) -> None:
    """Swap the in-text reference of one footnote for a wiki footnote template.

    The module-level ``text`` is rebound to a copy in which every reference
    anchor of this footnote is replaced by ``{{הערה|<footnote text>}}``.

    Args:
        match_obj: A match that provides the named groups ``note_num`` (the
            footnote number) and ``foonote`` (the footnote text).

    Returns:
        None. The result is delivered through the global ``text``.
    """
    global text
    num = match_obj.group("note_num")
    foonote = match_obj.group("foonote")
    # The anchor that points at this footnote from the body of the page.
    anchor = (
        f'<a href="#fn:{num}" id="fnref:{num}" title="see footnote" '
        f'class="footnote"><sup>{num}</sup></a>'
    )
    # A footnote containing "=" is passed as the explicit parameter "1=";
    # presumably so MediaWiki does not read the text before the "=" as a
    # parameter name.
    opening = "{{הערה|1=" if "=" in foonote else "{{הערה|"
    # Exactly one replacement pass: the two template forms are alternatives.
    text = text.replace(anchor, opening + foonote + "}}")


# Visit every footnote definition (the <li id="fn:N"> items) and let
# replace_foonote() rewrite the matching in-text reference inside the global
# ``text``.  Only the callback's side effect is wanted, so the matches are
# iterated with re.finditer instead of building a substituted copy that is
# thrown away.  The iterator scans the string object passed in here, so the
# callback rebinding ``text`` does not disturb the scan.
# NOTE(review): this step does not remove the footnote definitions themselves
# from ``text`` - confirm whether they should be stripped from the output.
for footnote_match in re.finditer(
    r'<li id="fn:(?P<note_num>\d+)">\n<p>(?P<foonote>.*?) <a href="#fnref:(?P=note_num)" title="return to body" class="reversefootnote">&#160;&#8617;</a></p>\n</li>',
    text,
    flags=re.DOTALL,
):
    replace_foonote(footnote_match)

# headers
def _heading_to_wiki(heading: re.Match) -> str:
    """Render one matched HTML heading as a wiki heading of the same level."""
    # <hN> is written with N "=" signs on each side of the title.
    marks = "=" * int(heading.group("level"))
    return f"{marks} {heading.group('header')} {marks}"


text = re.sub(
    r"<h(?P<level>[123456]) id=\".*?\">(?P<header>.*?)</h(?P=level)>",
    _heading_to_wiki,
    text,
)

# <strong>, <p>, <br>, etc.
# Bold: <strong>...</strong> becomes '''...'''; DOTALL lets the bold text
# span several lines.
bold_tag = re.compile(r"<strong>(.*?)</strong>", flags=re.DOTALL)
text = bold_tag.sub(r"'''\1'''", text)
# Paragraph and blockquote tags are dropped; their content is kept.
for tag_pattern in (r"</?p>", r"</?blockquote>"):
    text = re.sub(tag_pattern, "", text)
# Explicit line breaks turn into real newlines.
text = "\n".join(text.split("<br />"))

# tables
# Ordered rewrite rules as (pattern, replacement, flags).  The order matters:
# the structural wrappers are removed first, then rows are rewritten before
# the cells they contain, and the table is closed last.
_TABLE_RULES = (
    # Wrappers without a wiki-table equivalent in this conversion.
    (r"</?colgroup>\n?", "", 0),
    (r"<col />\n?", "", 0),
    (r"</?thead>", "", 0),
    (r"</?tbody>", "", 0),
    # Table start.
    (r"\n?<table>\n?", r'\n{| class="wikitable"\n', 0),
    # Rows (optionally carrying a class attribute); a row spans several lines.
    (r'\n?<tr( class=".*?")??>(.*?)</tr>\n?', r"\n|- \2\n", re.DOTALL),
    # Header cells and data cells.
    (r"\n?\t?<th>(.*?)</th>\n?", r"\n! \1", 0),
    (r"\n?\t?<td>(.*?)</td>\n?", r"\n| \1", 0),
    # Table end.
    (r"\n?</table>\n?", r"\n|}\n", 0),
)

for pattern, replacement, flags in _TABLE_RULES:
    text = re.sub(pattern, replacement, text, flags=flags)


# special characters
# Decode HTML character references (such as &amp; or &#160;) into the
# characters they stand for.
text = html.unescape(text)

# Change quotes, so they are the same as the regular quotes
straight_quotes = str.maketrans({"“": '"', "„": '"', "”": '"'})
text = text.translate(straight_quotes)

# img
# An embedded figure cannot be carried over automatically.  It is replaced by
# an HTML comment (Hebrew for "missing image that has to be imported")
# followed, on the next line, by the original image source.
figure_tag = re.compile(r'\n<figure>\n<img src="(.*?)" alt=".*?" />\n\n</figure>')
text = figure_tag.sub(r"<!-- תמונה חסרה שיש ליבא -->\n\1", text)


# Save the converted wiki code to the output file.
with open(OUTPUT_NAME, mode="w", encoding="utf-8") as file:
    file.write(text)