Want to get involved? We're always looking for ideas and content for Weekly Challenges.
SUBMIT YOUR IDEA
Wish I had the Intelligence Suite to try out the text analytics, but I'll settle for the fun parse!
I chose poorly at first by parsing out the subject instead of the topic view href. Once I switched, everything fell into place.
Regex is useful for parsing data. I'm learning more about regex challenge by challenge.
Here's my solution. I'm uploading the Alteryx workflow, but the parsing happens in Python.
import pandas as pd
import warnings
from bs4 import BeautifulSoup
from tqdm import tqdm
# Suppress UserWarning-category warnings that originate from the bs4 module,
# so they do not clutter the console output while the script runs.
warnings.filterwarnings("ignore", category=UserWarning, module='bs4')
class Challenges:
    """Strip HTML from the ``body`` column of an Excel workbook and export it.

    Typical use is ``parse_xl()`` followed by ``to_pandas()``.
    """

    def __init__(self, xl_path: str) -> None:
        """Store the workbook path and the parser settings.

        Args:
            xl_path: Path of the Excel workbook that ``parse_xl`` reads.
        """
        self.features = 'html.parser'  # parser name handed to BeautifulSoup
        self.xl_path = xl_path
        # Plain-text bodies from the most recent parse_xl() call.
        self.body_parsed = list()

    def _strip_html(self, markup) -> str:
        """Return the text content of *markup*, or ``''`` if it cannot be parsed."""
        try:
            return BeautifulSoup(markup=markup, features=self.features).text
        except TypeError:
            # BeautifulSoup could not handle this cell value (presumably a
            # non-string such as NaN from an empty cell); treat it as empty.
            return ''

    def parse_xl(self) -> pd.DataFrame:
        """Read the workbook and replace the HTML ``body`` column with plain text.

        Returns:
            The frame stored on ``self.df``: a ``body_parsed`` column is added
            and the ``topic view href`` and ``body`` columns are dropped.

        Raises:
            KeyError: If the workbook has no ``body`` or ``topic view href``
                column.
        """
        self.df = pd.read_excel(io=self.xl_path)
        # Rebuild the list on every call. Appending to the list created in
        # __init__ made a second parse_xl() call produce twice as many entries
        # as rows, which fails when assigned as a column below.
        # Iterating the column directly (instead of label lookups by integer)
        # also keeps this independent of the frame's index.
        self.body_parsed = [
            self._strip_html(markup) for markup in tqdm(self.df['body'])
        ]
        self.df['body_parsed'] = self.body_parsed
        self.df = self.df.drop(columns=['topic view href', 'body'])
        return self.df

    def to_pandas(
        self,
        out_path: str = 'my_test_227.xlsx',
        sheet_name: str = '227',
    ) -> None:
        """Write the parsed frame to an Excel workbook.

        String cells are written with ``unicode_escape`` applied, so non-ASCII
        and control characters appear as backslash escapes (presumably to keep
        characters the Excel writer rejects out of the workbook). The escaping
        is applied to a copy: ``self.df`` is left untouched, so exporting more
        than once does not double-escape the text.

        Args:
            out_path: Destination workbook path.
            sheet_name: Name of the sheet to write.

        Raises:
            AttributeError: If called before ``parse_xl`` has populated
                ``self.df``.
        """
        if getattr(self, 'df', None) is None:
            raise AttributeError('parse_xl() must be called before to_pandas()')

        def _escape(value):
            # Only strings are escaped; numbers, dates and NaN pass through.
            if isinstance(value, str):
                return value.encode('unicode_escape').decode('utf-8')
            return value

        # DataFrame.applymap is deprecated in favour of DataFrame.map
        # (pandas >= 2.1); fall back for older pandas versions.
        if hasattr(pd.DataFrame, 'map'):
            escaped = self.df.map(_escape)
        else:
            escaped = self.df.applymap(_escape)

        escaped.to_excel(
            excel_writer=out_path,
            sheet_name=sheet_name,
            freeze_panes=(1, 0),  # keep the header row visible when scrolling
            index=False,
        )
# Script entry point: parse the challenge workbook, then export the cleaned copy.
if __name__ == '__main__':
    challenges = Challenges(xl_path='./challenge_227.xlsx')
    challenges.parse_xl()    # read the workbook and strip the HTML bodies
    challenges.to_pandas()   # write the parsed frame back out to Excel