Issue
I am new to Python and have not yet learned OOP and classes. I thought I understood functions, but I am running into a problem when calling functions from a different .py file.
The code below shows all my functions, defined in main.py.
I want to split main.py into two other files, data_extraction.py and data_processing.py.
I understand this can be done using classes, but can it also be done without classes?
I divided the code into the two other files, but I am getting an error (please find my attached screenshot).
Please explain what I can do here!
main.py
import pandas as pd
import requests
from bs4 import BeautifulSoup
from configparser import ConfigParser
import logging
import data_extraction
config = ConfigParser()
config.read('config.ini')
logging.basicConfig(filename='logfile.log', level=logging.DEBUG,
                    format='%(asctime)s:%(lineno)d:%(name)s:%(levelname)s:%(message)s')
baseurl = config['configData']['baseurl']
sub_url = config['configData']['sub_url']
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.76 Safari/537.36',
    "Upgrade-Insecure-Requests": "1", "DNT": "1",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", "Accept-Language": "en-US,en;q=0.5",
    "Accept-Encoding": "gzip, deflate"
}
r = requests.get(baseurl, headers=headers)
status = r.status_code
soup = BeautifulSoup(r.content, 'html.parser')
model_links = []
all_keys = ['Model', 'Platform', 'Product Family', 'Product Line', '# of CPU Cores',
            '# of Threads', 'Max. Boost Clock', 'Base Clock', 'Total L2 Cache', 'Total L3 Cache',
            'Default TDP', 'Processor Technology for CPU Cores', 'Unlocked for Overclocking', 'CPU Socket',
            'Thermal Solution (PIB)', 'Max. Operating Temperature (Tjmax)', 'Launch Date', '*OS Support']
# function to get the model links in one list from the soup object (1st page extraction)
def get_links_in_list():
    for model_list in soup.find_all('td', headers='view-name-table-column'):
        # model_list = model_list.a.text - to get the model names
        model_list = model_list.a.get('href')
        # print(model_list)
        model_list = sub_url + model_list
        # print(model_list)
        one_link = model_list.split(" ")[0]
        model_links.append(one_link)
    return model_links
model_links = get_links_in_list()
logging.debug(model_links)
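# NOTE: on the next line, data_extraction is the imported module, not a function;
# calling a module raises TypeError: 'module' object is not callable (see the Solution below)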
each_link_data = data_extraction()
print(each_link_data)
#all_link_data = data_processing()
#write_to_csv(all_keys)
data_extraction.py
import requests
from bs4 import BeautifulSoup
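# NOTE: these imports run main.py while main.py is importing this file,
# creating a circular import (the Edit in the Solution below removes them)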
from main import baseurl
from main import all_keys
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.76 Safari/537.36',
    "Upgrade-Insecure-Requests": "1", "DNT": "1",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", "Accept-Language": "en-US,en;q=0.5",
    "Accept-Encoding": "gzip, deflate"
}
r = requests.get(baseurl, headers=headers)
status = r.status_code
soup = BeautifulSoup(r.content, 'html.parser')
model_links = []
# function to get data for each link from the website (2nd page extraction)
def data_extraction(model_links):
    each_link_data = []
    try:
        for link in model_links:
            r = requests.get(link, headers=headers)
            soup = BeautifulSoup(r.content, 'html.parser')
            specification = {}
            for key in all_keys:
                spec = soup.select_one(
                    f'.field__label:-soup-contains("{key}") + .field__item, .field__label:-soup-contains("{key}") + .field__items .field__item')
                # print(spec)
                if spec is None:
                    specification[key] = ''
                    if key == 'Model':
                        specification[key] = [i.text for i in soup.select_one('.page-title')]
                        specification[key] = specification[key][0:1:1]
                        # print(specification[key])
                else:
                    if key == '*OS Support':
                        specification[key] = [i.text for i in spec.parent.select('.field__item')]
                    else:
                        specification[key] = spec.text
            specification['link'] = link
            each_link_data.append(specification)
    except:
        print('Error occurred')
    return each_link_data
# print(each_link_data)
data_processing.py
# function for data processing: converting each link's dict into dataframe rows
def data_processing():
    all_link_data = []
    for each_linkdata_obj in each_link_data:
        # flatten the nested dictionary into a normal dict
        norm_dict = dict()
        for key in each_linkdata_obj:
            if isinstance(each_linkdata_obj[key], list):
                norm_dict[key] = ','.join(each_linkdata_obj[key])
            else:
                norm_dict[key] = each_linkdata_obj[key]
        all_link_data.append(norm_dict)
    return all_link_data
# print(all_link_data)
all_link_data = data_processing()
# function to write dataframe data into csv
def write_to_csv(all_keys):
    all_link_df = pd.DataFrame.from_dict(all_link_data)
    all_link_df2 = all_link_df.drop_duplicates()
    all_link_df3 = all_link_df2.reset_index()
    # print(all_link_df3)
    all_keys = all_keys + ['link']
    all_link_df4 = all_link_df3[all_keys]
    # print(all_link_df4)
    all_link_df4.to_csv('final_data.csv')
write_to_csv(all_keys)
Solution
Move the existing functions (e.g. write_to_csv) to a separate file, for example utility_functions.py. Import it in main.py using
from utility_functions import write_to_csv
Now you can call write_to_csv in main.py as
write_to_csv(all_keys)
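A minimal sketch of that layout. Note that the original write_to_csv also reads all_link_data from module scope; that global will not exist in the new file, so this sketch passes it in as a second parameter (the extra parameter is my addition):
utility_functions.py
import pandas as pd

# all_link_data is passed in explicitly instead of being read from a global in main.py
def write_to_csv(all_keys, all_link_data):
    all_link_df = pd.DataFrame.from_dict(all_link_data)
    all_link_df = all_link_df.drop_duplicates().reset_index()
    all_link_df = all_link_df[all_keys + ['link']]  # keep only the spec columns plus the link
    all_link_df.to_csv('final_data.csv')
main.py
from utility_functions import write_to_csv

write_to_csv(all_keys, all_link_data)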
Edit
In the main.py file, use
from data_extraction import data_extraction
instead of
import data_extraction
With a plain import data_extraction, the name data_extraction refers to the module itself, so calling data_extraction() raises TypeError: 'module' object is not callable.
In the data_extraction.py file, remove the lines
from main import baseurl
from main import all_keys
(having data_extraction.py import from main while main.py imports data_extraction creates a circular import). Removing them will leave those variables undefined inside data_extraction.py (a NameError); you can fix that by passing the variables as arguments in the function call.
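Putting both fixes together, a minimal sketch of the corrected split (the extraction body is trimmed here, passing all_keys as a second parameter is my addition, and the placeholder values in main.py stand in for the real config and link scraping):
data_extraction.py
import requests
from bs4 import BeautifulSoup

headers = {'User-Agent': 'Mozilla/5.0'}  # trimmed for the sketch

# no imports from main: model_links and all_keys arrive as parameters
def data_extraction(model_links, all_keys):
    each_link_data = []
    for link in model_links:
        r = requests.get(link, headers=headers)
        soup = BeautifulSoup(r.content, 'html.parser')
        specification = {key: '' for key in all_keys}  # the real per-key extraction goes here
        specification['link'] = link
        each_link_data.append(specification)
    return each_link_data
main.py
from data_extraction import data_extraction  # import the function, not the module

all_keys = ['Model', 'Platform']  # trimmed for the sketch
model_links = ['https://example.com/model-1']  # placeholder; normally built by get_links_in_list()

each_link_data = data_extraction(model_links, all_keys)
print(each_link_data)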
Answered By – Manish Shetty
Answer Checked By – Marilyn (BugsFixing Volunteer)