Excel 시트내용 검색¶

In [ ]:

# %%time
# file = 'csv/Food-data.xlsx'
# import xlrd
# xls        = xlrd.open_workbook(file)
# sheet_list = xls.sheet_names()[4:-8]

# import pandas as pd
# result, no = {}, 0
# for sheet in sheet_list:
#     df = pd.read_excel(file, sheet_name=sheet)
#     df = df.fillna(method='ffill')
#     df = df.loc[:,['메뉴명', '단품명', '1인량']]
#     no += 1
#     if no % 10 == 0: 
#         print('{:4,} th sheet working'.format(no))
#     # 컬럼의 오타를 변경
#     df.메뉴명 = list(map(lambda x : x.replace('\n','') , list(df.메뉴명)))  
#     temps   = {}
#     for menu in list(set(df.메뉴명)):
#         df_temp = df[df.메뉴명 == menu]
#         foods   = {df_temp.iloc[idx,1]: df_temp.iloc[idx,2]  for idx in range(len(df_temp.단품명))}
#         temps[menu] = foods
#     for menu in temps.keys():
#         # 기존에 없던 메뉴가 확인시
#         if menu not in result.keys():
#             result[menu] = temps[menu]
#         # 메뉴가 있으면 레시피만 추가
#         else:
#             # 재료가 없으면 재료이름과 해당 내용을 추가
#             for recipie in temps[menu].keys():
#                 if recipie not in result[menu].keys():
#                     result[menu][recipie] = temps[menu][recipie]
# import json
# data = json.dumps(result, ensure_ascii=False)
# with open('food_recipie.json', 'w', encoding='UTF-8-sig') as file:
#      file.write(data)

메뉴 데이터를 Python Pandas 로 불러오기

1 Read xls to Pandas¶

메뉴 데이터를 Python Pandas 로 불러오기

공학도를 위한 파이썬 | OpenpyXl
후처리 편의성을 위해선 Pandas 가 유용하다고 판단

In [ ]:

# Workbook that holds the weekly menu sheets.
file = 'data/2018_muyong_menu.xlsx'

import xlrd  # kept so the name stays available to later cells
import pandas as pd

# List the sheet names through pandas, which selects an Excel engine that
# matches the file type; xlrd >= 2.0 only reads legacy .xls workbooks and
# rejects .xlsx files. The context manager closes the workbook afterwards.
with pd.ExcelFile(file) as xls:
    sheet_list = xls.sheet_names
# Show the number of sheets and every tenth sheet name as a quick overview.
len(sheet_list), sheet_list[::10]

# Alternative that reads the sheet names with openpyxl directly:
# from openpyxl import Workbook, load_workbook
# wb        = load_workbook(file)
# st_names  = [ws.title  for ws in wb.worksheets]

In [3]:

# Take a look at a representative sheet: load it without a header row so
# the raw cell layout (blank rows and columns included) stays visible.
import pandas as pd
df = pd.read_excel(file, sheet_name=sheet_list[3],header=None)
df.head()

Out[3]:

	0	1	2	3	4	5	6
0	NaN	NaN	NaN	NaN	NaN	NaN	NaN
1	NaN	NaN	월 ( 1월22일)	화 (23일 )	수 (24일 )	목 (25일 )	금 (26일)
2	NaN	NaN	청양콩나물국	조갯살미역	설렁탕/소면	참치김치찌개	훈제오리야채볶음밥
3	NaN	NaN	돈육맥적구이	허니버터치킨	탕수육	제육고추장불고기	돈까스/소스
4	NaN	중	마파두부	난자완스	오징어어묵볶음	생선까스튀김	어묵국

2 Pre Processing¶

line 별 작업결과를 바탕으로 정리하기

Pandas to Series

In [4]:

# Start of the work: take row 1 of the sheet as a Series and list its raw values
table_row = df.iloc[1,:]
list(table_row.values)

Out[4]:

[nan, nan, '월 ( 1월22일)', '화 (23일 )', '수 (24일 )', '목 (25일 ) ', '금 (26일)']

In [5]:

# notnull(): one boolean per cell, True where the cell holds a value
table_row.notnull().tolist()

Out[5]:

[False, False, True, True, True, True, True]

In [6]:

# isna(): one boolean per cell, True where the cell is missing
table_row.isna().tolist()

Out[6]:

[True, True, False, False, False, False, False]

In [7]:

# Drop the missing cells and keep only the remaining values as a list
table_row.dropna().tolist()

Out[7]:

['월 ( 1월22일)', '화 (23일 )', '수 (24일 )', '목 (25일 ) ', '금 (26일)']

In [8]:

# Join the remaining cells into one string so a single regex can scan the row
" ".join(table_row.dropna().tolist())

Out[8]:

'월 ( 1월22일) 화 (23일 ) 수 (24일 ) 목 (25일 )  금 (26일)'

In [9]:

import re
# Collect every weekday character found in the joined row text.
# NOTE: the pattern also hits the month marker '월' inside a date such as
# '1월22일', so the token list can hold more entries than weekday cells.
tokens = re.findall(r"(월|화|수|목|금)", 
                    " ".join(table_row.dropna().tolist()))
print(tokens)
# Four or more tokens: treat the row as the header of a weekly table.
if len(tokens) >= 4:
    print("주간 데이터 작업시작")

['월', '월', '화', '수', '목', '금']
주간 데이터 작업시작

3 점심/ 저녁 메뉴목록 수집하기¶

line 별 작업결과를 바탕으로 정리하기

Pandas to Series

In [10]:

# Start of the work: inspect row 2, the row directly below the weekday header
table_row = df.iloc[2,:]
list(table_row.values)

Out[10]:

[nan, nan, '청양콩나물국', '조갯살미역', '설렁탕/소면', '참치김치찌개', '훈제오리야채볶음밥']

In [11]:

# Start of the work: inspect row 12 of the same sheet
table_row = df.iloc[12,:]
list(table_row.values)

Out[11]:

[nan, '식수', nan, nan, nan, nan, nan]

In [12]:

# isna(): summing the boolean flags counts the missing cells in the row
sum(table_row.isna().tolist())

Out[12]:

In [13]:

# Look for the literal label '식수' in the row's non-missing cells
re.findall(r"식수", " ".join(list(table_row.dropna())))

Out[13]:

['식수']

In [14]:

# Sanity check: text that contains '수' but not '식수' yields no match
re.findall(r"식수", "알고리즘 수수수")

Out[14]:

[]

4 세부 수집기 작업하기¶

pandas 와 regex 활용 https://stackoverflow.com/questions/52173161/getting-a-list-of-indices-where-pandas-boolean-series-is-true

In [15]:

import xlrd, re
import pandas as pd

# Extract the sheet list from an Excel workbook
def menu_sheet_names(file):
    """Return the workbook path together with the names of its sheets.

    The sheet names are read through ``pandas.ExcelFile`` so that the Excel
    engine is chosen from the file type; ``xlrd`` 2.0 and later only opens
    legacy ``.xls`` workbooks and rejects ``.xlsx`` files. The workbook is
    closed again as soon as the names have been read.

    Args:
        file: Path of the Excel workbook.

    Returns:
        A ``(file, sheet_names)`` tuple, where ``file`` is the argument
        passed in and ``sheet_names`` is a list of sheet names in workbook
        order.
    """
    with pd.ExcelFile(file) as workbook:
        return file, list(workbook.sheet_names)

# Weekday characters (Mon-Fri) that mark the header row of a weekly menu.
# NOTE: '월' is also the month marker inside dates such as '1월22일', so one
# header cell can contribute more than one match.
MENU_WEEKDAY_RE = re.compile(r"(월|화|수|목|금)")


def menu_header_row(df, min_tokens=4):
    """Return the position of the first row that looks like a weekday header.

    Each row is reduced to its non-missing cells, joined into one string and
    scanned for weekday characters. Cells are converted with ``str`` first,
    so rows holding numbers or dates are scanned instead of raising
    ``TypeError`` inside ``str.join``.

    Args:
        df: Sheet contents as a DataFrame.
        min_tokens: Minimum number of weekday matches a row needs in order
            to be accepted as the header.

    Returns:
        The zero-based row position of the header, or ``None`` when no row
        qualifies. A header in the very first row yields ``0``, so callers
        must compare against ``None`` rather than test truthiness.
    """
    for row_idx in range(df.shape[0]):
        cells = df.iloc[row_idx, :].dropna()
        row_text = " ".join(str(cell) for cell in cells)
        if len(MENU_WEEKDAY_RE.findall(row_text)) >= min_tokens:
            return row_idx
    return None


# Check whether a sheet can be processed by locating its weekday header row
def menu_valid_check(file, sheet_list, sheet_num):
    """Load one sheet and locate its weekday header row.

    Args:
        file: Path of the Excel workbook.
        sheet_list: Sheet names of the workbook.
        sheet_num: Position in ``sheet_list`` of the sheet to load.

    Returns:
        A ``(df, row)`` tuple: the sheet read without a header row, and the
        position of the weekday header row. ``row`` is ``None`` when the
        sheet has no such row; in that case a notice is printed.
    """
    df = pd.read_excel(file, sheet_name=sheet_list[sheet_num], header=None)
    header_row = menu_header_row(df)
    if header_row is None:
        print(sheet_num, "작업할 내용이 없습니다")
    return df, header_row

In [16]:

# Extract the valid menu data from one sheet
def menu_valid_data(df, col_start, min_filled=12):
    """Collect the lunch and dinner menu of every weekday column in a sheet.

    The sheet is assumed to carry default integer row and column labels
    (as produced by ``read_excel(..., header=None)``), because labels taken
    from the header row are reused as positions.

    Args:
        df: Sheet contents as a DataFrame.
        col_start: Position of the weekday header row (despite the name,
            this is a row position).
        min_filled: A header column is used only when it holds more than
            this many non-missing cells.

    Returns:
        A list of ``(day, meal, items)`` tuples. ``day`` is the header text
        with all spaces removed, ``meal`` is ``"중식"`` or ``"석식"``, and
        ``items`` is the menu joined with ``";"``. All lunch entries come
        first, then all dinner entries, each in column order.

    Raises:
        IndexError: When no column passes the ``min_filled`` filter, or the
            first data column has fewer than three missing cells to serve
            as block boundaries.
    """
    meal_names = ("중식", "석식")

    # Columns that carry a value in the header row ...
    col_valid = list(df.iloc[col_start, :].dropna().index)
    # ... narrowed to those with enough filled cells to be real menu columns.
    col_data = [col for col in col_valid
                if df.iloc[:, col].notnull().sum() > min_filled]
    # str() keeps non-text header cells (e.g. dates) from breaking .strip().
    col_name = [str(cell).strip().replace(" ", "")
                for cell in df.iloc[col_start, col_data].tolist()]

    # Missing cells in the first data column delimit the meal blocks. The
    # first boundary is replaced by the header row itself.
    blank = df.iloc[:, col_data[0]].isna()
    row_valid = list(blank[blank].index)[:3]
    row_valid[0] = col_start
    # Lunch runs from the header to the second boundary; dinner stops one
    # row short of the third boundary (presumably to leave out a trailing
    # non-menu line -- confirm against the sheets).
    limit_row = [
        (row_valid[0] + 1, row_valid[1]),
        (row_valid[1] + 1, row_valid[2] - 1),
    ]

    result = []
    for meal, (top, bottom) in zip(meal_names, limit_row):
        block = df.iloc[top:bottom, col_data]
        # Positional access keeps columns apart even if two header names
        # normalise to the same text.
        for pos, day in enumerate(col_name):
            # Skip empty cells and stringify the rest so a single blank or
            # numeric cell no longer raises TypeError in str.join.
            items = block.iloc[:, pos].dropna()
            result.append((day, meal, ";".join(str(item) for item in items)))
    return result

In [17]:

# Build the menu table from every sheet of the workbook.
file = 'data/2018_muyong_menu.xlsx'
file_name, sheet_list = menu_sheet_names(file)
result = []
skipped = []  # (sheet number, sheet name, error) for sheets that failed
from tqdm import tqdm
for sheet_num in tqdm(range(len(sheet_list))):
    table_sheet, header_row = menu_valid_check(file_name, sheet_list, sheet_num)
    # Compare against None: a header found in row 0 is a valid result and
    # must not be discarded by a truthiness test.
    if header_row is None:
        continue
    try:
        result += menu_valid_data(table_sheet, header_row)
    except Exception as err:
        # Best effort: a sheet with an unexpected layout is skipped, but it
        # is recorded instead of being dropped silently.
        skipped.append((sheet_num, sheet_list[sheet_num], repr(err)))
# Report the skipped sheets after the loop so the progress bar stays intact.
for entry in skipped:
    print("skipped sheet:", *entry)
menu_table = pd.DataFrame(result)
print(menu_table.shape)
menu_table.head(3)

100%|██████████| 53/53 [00:04<00:00, 12.51it/s]

52 작업할 내용이 없습니다
(462, 3)

Out[17]:

	0	1	2
0	화(2일)	중식	사골우거지해장국;제육간장불고기;꽃게양념무침;두부구이&양념장;양상추샐러드/흑임자D;구...
1	수(3일)	중식	소고기미역국;묵은지닭찜;오징어까스&어니언소스;어묵간장볶음;멕시칸샐러드;비빔막국수;봄...
2	목(4일)	중식	얼갈이된장국;돈육맥적구이;해물누룽지탕;건파래실치볶음;양상추샐러드/키위D;진미채야채초...

Excel 시트내용 검색¶

Menu List Rearrange¶

1 Read xls to Pandas¶

2 Pre Processing¶

3 점심/ 저녁 메뉴목록 수집하기¶

4 세부 수집기 작업하기¶