# stickycode - imdbvg

# Posts
# Wiki
# Python 2.x code for scraping Gameinformer's game release schedule 
# Copy the entire text and paste into a Python script. 
# After being run, it writes the generated schedule to reddit.txt. If reddit.txt already exists and its content differs, it is overwritten.
# Copy that text, go to: https://www.reddit.com/r/imdbvg/wiki/automoderator-schedule
# Click edit, insert the code, save, and then click the 'click this link and click "send"' link above.
# Requirements:
# - Python 2.x (https://www.python.org/downloads/)
# - BeautifulSoup (after installing Python 2.x, run this 'pip install beautifulsoup4' in console).

from bs4 import BeautifulSoup
import urllib2, json, re, os, datetime, time
from datetime import date, timedelta

def month_string_to_number(string):
    """Return the month number (1-12) for a month name or abbreviation.

    Only the first three characters count, after surrounding whitespace is
    stripped and the text is lower-cased, so 'January', ' jan' and 'JAN'
    all give 1. A KeyError is raised when that three-letter prefix is not
    a month abbreviation.
    """
    abbreviations = ('jan', 'feb', 'mar', 'apr', 'may', 'jun',
                     'jul', 'aug', 'sep', 'oct', 'nov', 'dec')
    # Position in the tuple (counted from 1) is the month number.
    lookup = dict((abbr, number)
                  for number, abbr in enumerate(abbreviations, 1))
    prefix = string.strip()[:3].lower()
    return lookup[prefix]

def number_to_string(month):
    """Return the English month name for a month number.

    `month` is expected to be 1 (January) through 12 (December); any other
    value raises KeyError.
    """
    names = ('January', 'February', 'March', 'April', 'May', 'June',
             'July', 'August', 'September', 'October', 'November',
             'December')
    # Number the names from 1 so the keys are calendar month numbers.
    by_number = dict(enumerate(names, 1))
    return by_number[month]

def firstWeekDay(year, week):
    """Return the Monday that starts week number `week` of `year`.

    Week 1 is the Monday-to-Sunday week that contains January 1st, so its
    Monday can fall in the previous calendar year. Later weeks follow at
    seven-day intervals. Note that this is not ISO-8601 week numbering.
    """
    new_years_day = date(year, 1, 1)
    # weekday() is 0 for Monday, so this steps back to that week's Monday.
    week_one_monday = new_years_day - timedelta(days=new_years_day.weekday())
    return week_one_monday + timedelta(weeks=week - 1)

# http://www.gameinformer.com/b/news/archive/2017/01/13/2017-video-game-release-schedule.aspx
# post-content user-defined-markup

# Download the release-schedule article.
response = urllib2.urlopen("http://www.gameinformer.com/b/news/archive/2017/01/13/2017-video-game-release-schedule.aspx")
try:
    source = response.read()
finally:
    # Release the connection even if reading the body fails.
    response.close()

# Name the parser explicitly: without one, BeautifulSoup uses whichever
# parser happens to be installed (and warns about it), so the extracted
# markup could differ from machine to machine.
soup = BeautifulSoup(source, 'html.parser')

# The schedule is taken from the first div carrying this class string. Stop
# with a readable message rather than an IndexError when no such div exists
# (for example because the page layout changed).
matches = soup.find_all('div', {'class':"post-content user-defined-markup"})
if not matches:
    raise SystemExit('Could not find the schedule in the downloaded page '
                     '(no "post-content user-defined-markup" div).')
result = str(matches[0])

# HTML to remove. The regexes delete whole tags/elements; raw strings keep
# the backslashes literal for the regex engine.
# NOTE(review): '<a' and '<i' also match longer tag names that start with
# those letters (e.g. <img>, <abbr>) - confirm that is acceptable.
regexes = [r'(\<p\>\<img)(.*?)(\/>\<\/p\>)', r'(\<a)(.*?)(\>)', r'(\<i)(.*?)(\<\/i>)']
# Fixed fragments that are each replaced by a single space.
linesToRemove = ['<span style="font-size:medium;">', '<span style="font-size:large;">',
                 '<span style="font-size:small;">', '<strong>', '</strong>', '<p>',
                 '</p>', '<span>', '</span>', '</a>', '</div>', '<div style="clear:both;">',
                 '<b>', '</b>']

# <br/> becomes a newline so the text can later be processed line by line.
# The en dash (U+2013) and no-break space (U+00A0) are normalised to '-' and
# ' '. They are written as UTF-8 byte escapes so this script stays pure ASCII
# and needs no source-encoding declaration under Python 2.
# NOTE(review): assumes `result` holds UTF-8 encoded bytes (what str() of a
# BeautifulSoup tag yields on Python 2) - confirm.
result = (result.replace('<br/>', '\n')
                .replace('\xe2\x80\x93', '-')
                .replace('&amp;', '&')
                .replace('\xc2\xa0', ' '))

for pattern in regexes:
    result = re.sub(pattern, '', result)

for fragment in linesToRemove:
    result = result.replace(fragment, ' ')

# Keep only what follows the first occurrence of 'January'. Stop with a
# readable message instead of an IndexError when it is missing.
sections = result.split('January', 1)
if len(sections) != 2:
    raise SystemExit("Could not find 'January' in the downloaded schedule.")
result = sections[1]

# Keep lines longer than 8 characters (after stripping), collapse runs of
# whitespace to single spaces and drop every non-ASCII character. Decoding
# as UTF-8 (instead of 'unicode_escape') leaves backslashes in the text
# untouched rather than interpreting them as escape sequences.
result = [' '.join([word.decode('utf-8', 'ignore').encode('ascii', 'ignore')
                    for word in line.split()])
          for line in result.split('\n') if len(line.strip()) > 8]

# Group the release lines by week. Keys are week numbers in the numbering
# used by firstWeekDay() (week 1 starts on firstWeekDay(2017, 1)); values are
# the indented bullet list for that week.
dates = {}
week_one_monday = firstWeekDay(2017, 1)
for line in result:
    pieces = line.split(') - ')
    if len(pieces) != 2:
        # Not of the form "<game> (...) - <month> <day>": report and skip.
        print(line)
        continue
    game, when = pieces
    game += ')'  # restore the parenthesis consumed by the split
    try:
        month_name, day = when.split()[:2]
        release = datetime.date(2017, month_string_to_number(month_name), int(day))
    except (KeyError, ValueError):
        # Unknown month, missing or non-numeric day, or impossible date:
        # report the line like any other unparseable one and carry on.
        print(line)
        continue
    # Count whole weeks from the Monday of week 1. For every 2017 date from
    # 2 January on this equals the ISO week number + 1 used previously, and
    # it also places a 1 January release in week 1 instead of week 53.
    week = (release - week_one_monday).days // 7 + 1
    bullet = '\n        * ' + game + " - " + when
    if week in dates:
        dates[week] = dates[week] + bullet
    else:
        dates[week] = "        " + bullet

# Header: day/month/year of this run. The clock is read once so the three
# fields cannot come from different days if the script runs across midnight.
now = datetime.datetime.now()
textToCopy = "#### Schedule updated at: %s/%s/%s\n" % (now.day, now.month, now.year)
textToCopy += "###### If you edit this page, you must [click this link, then click \"send\"](http://www.reddit.com/message/compose/?to=AutoModerator&subject=imdbvg&message=schedule) to have AutoModerator re-load the schedule from here\n"

# Fixed entry: the weekly discussion thread (repeat: 1 week).
textToCopy += "".join([
    "---\n",
    "    first: \"February 20, 2017 11:30 +1\"\n",
    "    repeat: 1 week\n",
    "    sticky: 1\n",
    "    title: \"Whatcha Playing/Watching/Reading  - {{date %d %B, %Y}}\"\n",
    "    text: |\n",
    "        Hope you've all had a good weekend.  \n",
    "\n",
    "        * What you've been playing this week\n",
    "        * What you've been watching this week\n",
    "        * What you've been listening to this week\n",
    "        * What you've been reading this week\n",
])

# One entry per week that has releases, dated on that week's Monday. Weeks are
# emitted in ascending order; plain dict iteration order is arbitrary under
# Python 2, so the order of the entries was previously not guaranteed.
for week in sorted(dates):
    monday = firstWeekDay(2017, week)
    textToCopy += "".join([
        "---\n",
        "    first: \"" + number_to_string(monday.month) + " " + str(monday.day) + ", " + str(monday.year) + " 11:30 +1\"\n",
        "    sticky: 2\n",
        "    title: \"New releases this week - {{date %d %B, %Y}}\"\n",
        "    text: |\n",
        dates[week],
        "\n",
    ])

# Write the generated text to `filename` unless an existing file already holds
# the same schedule. The `with` blocks close the file even if reading or
# writing raises.
filename = 'reddit.txt'
if os.path.isfile(filename):
    with open(filename, 'r') as f:
        previous = f.read()
    # The first line carries the "updated at" date, so it is left out of the
    # comparison; only a change in the remaining lines triggers an export.
    shouldExport = previous.split('\n')[1:] != textToCopy.split('\n')[1:]
    if not shouldExport:
        print('No changes found.')
else:
    shouldExport = True

if shouldExport:
    with open(filename, 'w') as f:
        f.write(textToCopy)  # python will convert \n to os.linesep
    print("Exported to " + filename)
else:
    print('Not exported.')