import newspaper
import urllib.parse


class ArticleScraper:
    def __init__(self):
        pass

    def articleScraper(self, article_links):
        article_content = []
        for url in article_links:
            # Build, download, and parse each article with newspaper3k
            article = newspaper.Article(url=url, language='en')
            article.download()
            article.parse()
            content = f"TITLE: {article.title} ARTICLES: {article.text}"
            print(urllib.parse.unquote(content))
            article_content.append(content)
        return "\n".join(article_content)


# list_of_urls is a list of article links defined elsewhere in my script
sol = ArticleScraper()
print(sol.articleScraper(list_of_urls))
This is my current code. The problem I'm having is that whenever it outputs the content or the text, it doesn't scrape all of the UTF-8 characters correctly.
I've tried urllib3 and bs4 as well. I had no luck with urllib3; with bs4 the encoding and decoding works, but I wanted to use newspaper3k because it's more efficient for scraping.
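Roughly what my bs4 attempt looked like is below. This is a sketch from memory; the helper name and the use of apparent_encoding are my approximation, not the exact code I ran, but it's the version where the encoding came out right:

import requests
from bs4 import BeautifulSoup


def scrape_with_bs4(url):
    # Fetch the page and let requests guess the charset from the body,
    # which is what seemed to fix the encoding for me
    response = requests.get(url)
    response.encoding = response.apparent_encoding
    soup = BeautifulSoup(response.text, "html.parser")
    title = soup.title.get_text() if soup.title else ""
    text = " ".join(p.get_text() for p in soup.find_all("p"))
    return f"TITLE: {title} ARTICLES: {text}"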
The code above is meant to be a minimal working example so you can reproduce the problem.
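For completeness, list_of_urls is just a plain list of article links. The URLs below are placeholders, not the real ones I'm scraping, but any English-language news article URLs should show the same behaviour:

# Placeholder URLs so the snippet above runs end to end
list_of_urls = [
    "https://example.com/some-article",
    "https://example.com/another-article",
]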