# Step 1. Upgrade matplotlib (and seaborn / pandas) to the latest release.
!pip install matplotlib -U
!pip install seaborn -U
!pip install pandas -U
# Step 2. Install the Nanum Korean fonts and enable them.
!sudo apt-get install -y fonts-nanum
!sudo fc-cache -fv
# Delete matplotlib's cache directory (presumably so the new fonts are picked up on the next import).
!rm ~/.cache/matplotlib -rf
# Step 3. After running this cell, restart the runtime.
# Step 4. Configure the Korean font.
import matplotlib.pyplot as plt
# Prefer NanumGothic and fall back to the generic sans-serif family.
plt.rcParams['font.family']=['NanumGothic', 'sans-serif']
# Disable the Unicode minus sign for axis labels.
plt.rcParams['axes.unicode_minus'] = False
# Check that Korean text renders.
%matplotlib inline
fig, ax = plt.subplots(figsize=(10,3))
ax.text(0.5, 0.5, "한글 테스트")
plt.show()
import numpy as np
import pandas as pd
import seaborn as sns
import requests
from copy import deepcopy
# Global look-and-feel for seaborn plots and the pandas column display limit.
sns.set_context("talk")
sns.set_style("whitegrid")
pd.set_option("display.max_columns", 50)
# The seaborn calls above overwrite the Korean font settings, so apply them again.
plt.rcParams.update({
    'font.family': ['NanumGothic', 'sans-serif'],
    'axes.unicode_minus': False,
})
# A key issued by the KOBIS Open API service is required: http://www.kobis.or.kr/kobisopenapi/homepg/main/main.do
APIKEY = '본인의_API_KEY'
# Directory where the movie data is stored.
!mkdir data
%%time
# 영화 목록 다운로드
url = 'http://kobis.or.kr/kobisopenapi/webservice/rest/movie/searchMovieList.json'
cols_movielist = ['movieCd', 'movieNm', 'movieNmEn', 'prdtYear', 'openDt', 'typeNm', 'prdtStatNm', 'nationAlt', 'genreAlt', 'repNationNm', 'repGenreNm', 'directors', 'companys', 'openYear']
df_movielist_raw = pd.DataFrame(columns=cols_movielist)
for p in range(1000):
# 영화개봉년도 1961~2021 검색. 한 페이지에 100개 아이템, 총 900페이지 = 최대 10만개 데이터
r = requests.get(url, params={'key':APIKEY, 'openStartDt':'1960', 'openEndDt':'2021', 'itemPerPage':'100', 'curPage':str(p+1)})
df_ = pd.DataFrame(r.json()['movieListResult']['movieList'])
df_movielist_raw = pd.concat([df_movielist_raw, df_], axis=0)
df_movielist_raw.to_pickle("./data/df_movielist_raw.pkl")
# Production year: turn empty strings into NaN and convert the remaining values to int.
df_movielist_raw.loc[df_movielist_raw["prdtYear"] == "", "prdtYear"] = np.nan
has_prdt_year = df_movielist_raw["prdtYear"].notna()
df_movielist_raw.loc[has_prdt_year, "prdtYear"] = df_movielist_raw.loc[has_prdt_year, "prdtYear"].astype(int)
# Opening year: the first four characters of the opening date, as int.
df_movielist_raw["openYear"] = df_movielist_raw["openDt"].str[:4].astype(int)
# Opening date as int (assumes a YYYYMMDD string — TODO confirm against the API response).
df_movielist_raw["openDt"] = df_movielist_raw["openDt"].astype(int)
# Sort by opening date. sort_values returns a new frame, so the result must be
# assigned back (the bare call left the data unsorted); the stable sort keeps
# the download order for films sharing a date.
df_movielist_raw = df_movielist_raw.sort_values("openDt", kind="stable").reset_index(drop=True)
# Save to file.
df_movielist_raw.to_pickle("./data/df_movielist_raw.pkl")
# Downloading through the API takes about 15 minutes (on Colab).
# Backing the data up, e.g. to your own Google Drive, is recommended.
df_movielist_raw = pd.read_pickle("./data/df_movielist_raw.pkl")

# --- df_movielist ---
# (The section heading used to be a bare `df_movielist` expression followed by a
# stray pilcrow character, which raised NameError / SyntaxError when run as code.)
df_movielist_raw = pd.read_pickle("./data/df_movielist_raw.pkl")
# Restrict to films that opened between 1971 and 2020.
df_movielist = df_movielist_raw.query("1971 <= openYear <= 2020")
# Restrict to films whose production status is "개봉" (released).
df_movielist = df_movielist.query("prdtStatNm == '개봉'")
print(df_movielist.shape)
df_movielist.head()
# Remove films that have no representative nation.
df_movielist = df_movielist.drop(df_movielist.query("repNationNm == ''").index)
df_movielist.shape
# Number of films per representative nation.
df_movielist["repNationNm"].value_counts()
# Back up the data.
df_movielist.to_pickle("./data/df_movielist.pkl")
# --- df_nations ---
# Build a separate dataset. Copy the selected columns so that the column
# assignments below modify an independent frame rather than a slice of
# df_movielist (which triggers SettingWithCopyWarning).
df_nations = df_movielist[["movieCd", "movieNm", "openYear", "repNationNm"]].copy()
df_nations = df_nations.dropna(subset=["repNationNm"])
# Collect the distinct production nations.
nations = np.unique(','.join(df_nations.loc[df_nations["repNationNm"] != ""]['repNationNm']).split(','))
print(f"{len(nations)} Nations: {nations}")
# One-hot encode the nations. Compare whole comma-separated names: a substring
# test (str.contains) would also flag e.g. "인도네시아" as "인도".
nation_lists = df_nations["repNationNm"].str.split(",")
for nation in nations:
    df_nations[f"N_{nation}"] = nation_lists.apply(lambda names, n=nation: n in names).astype('int')
df_nations.tail()
# Flag foreign films (representative nation is not Korea).
df_nations["해외"] = (df_nations["repNationNm"] != "한국").astype('int')
df_nations.drop("repNationNm", axis=1, inplace=True)
# Number of Korean and foreign films.
print("# 한국 : ", df_nations["N_한국"].sum())
print("# 해외: ", df_nations["해외"].sum())
# Top 20 nations by number of released films. Sum only the N_* indicator
# columns; summing the whole frame also concatenates every movieCd / movieNm string.
df_nations_top20 = df_nations.filter(like="N_").sum().sort_values(ascending=False)[:20]
df_nations_top20
# Number of releases per year.
df_nationsY = df_nations.groupby("openYear").sum()
df_nationsY.reset_index(inplace=True)
df_nationsY.head()
# Back up the data.
df_nations.to_pickle("./data/df_nations.pkl")
df_nationsY.to_pickle("./data/df_nationsY.pkl")
# Releases up to 1987: Korean vs. foreign films, stacked per year.
fig, ax = plt.subplots(figsize=(10, 3))
until_1987 = df_nationsY.query("openYear < 1988")
until_1987[["openYear", "N_한국", "해외"]].plot.bar(x="openYear", stacked=True, ax=ax)

# The 1988-2010 window, per film and per year.
movies_1988_2010 = df_nations.query("1988 <= openYear <=2010")
yearly_1988_2010 = df_nationsY.query("1988 <= openYear <=2010")

# Korean films released 1988-2010: total and per-year bars.
movies_1988_2010["N_한국"].sum()
fig, ax = plt.subplots(figsize=(10, 3))
sns.barplot(x="openYear", y="N_한국", data=yearly_1988_2010, ax=ax)
# Yearly average of Korean films over the 23 years 1988-2010.
movies_1988_2010["N_한국"].sum()/23

# Foreign films released 1988-2010: total and per-year bars.
movies_1988_2010["해외"].sum()
fig, ax = plt.subplots(figsize=(10, 3))
sns.barplot(x="openYear", y="해외", data=yearly_1988_2010, ax=ax)

# Hong Kong films released 1988-2010: total and per-year bars.
movies_1988_2010["N_홍콩"].sum()
fig, ax = plt.subplots(figsize=(10, 3))
sns.barplot(x="openYear", y="N_홍콩", data=yearly_1988_2010, ax=ax)

# Yearly average of foreign films over the 23 years 1988-2010.
movies_1988_2010["해외"].sum()/23
# Number of Korean films released in 1998.
df_nationsY.query("openYear == 1998")["N_한국"]
# Number of distinct production nations with at least one release, per year.
n_nations_per_year = (df_nationsY.filter(like="N_") > 0).sum(axis=1)
fig, ax = plt.subplots(figsize=(10, 3))
ax.bar(df_nationsY["openYear"], n_nations_per_year)
# Year with the most diverse set of production nations over the whole range.
# argmax() is positional, so index with .iloc rather than by label.
df_nationsY["openYear"].iloc[n_nations_per_year.argmax()]
# Number of production nations in that year.
n_nations_per_year.max()
# Largest number of production nations in a single year up to 1987.
# The cutoff was "openYear < 1989", which also counted 1988 and disagreed with
# both this description and the "openYear < 1988" filter used for the same
# period elsewhere in the notebook.
(df_nationsY.query("openYear < 1988").filter(like="N_") > 0).sum(axis=1).max()
# Japanese films released per year, from 2000 on.
fig, ax = plt.subplots(figsize=(10, 3))
sns.barplot(x="openYear", y="N_일본", data=df_nationsY.query("2000 <= openYear"), ax=ax)

# Yearly counts from 2010 on (the OTT-service era).
since_2010 = df_nationsY.query("2010 <= openYear ")

# Foreign vs. US films after OTT services.
fig, ax = plt.subplots(figsize=(10, 3))
since_2010[["openYear", "해외"]].plot.bar(x="openYear", ax=ax)
since_2010[["openYear", "N_미국"]].plot.bar(x="openYear", ax=ax, color="C1")

# Foreign vs. Korean films after OTT services.
fig, ax = plt.subplots(figsize=(10, 3))
since_2010[["openYear", "해외"]].plot.bar(x="openYear", ax=ax, color="C0")
since_2010[["openYear", "N_한국"]].plot.bar(x="openYear", ax=ax, color="C2")

# Are Snowpiercer and Minari Korean films? Minari is a US film, and as a 2021
# release it is outside the analysed range.
df_movielist_raw.query("movieNm == '미나리'")
# Snowpiercer is a Korean film.
df_nations.query("movieNm == '설국열차'")
# --- df_genres ---
# Build the genre dataset. Copy the selected columns so that the column
# assignments below modify an independent frame rather than a slice of df_movielist.
df_genres = df_movielist[["movieCd", "movieNm", "openYear", "genreAlt"]].copy()
df_genres = df_genres.dropna(subset=["genreAlt"])
# Collect the distinct genres; drop any parenthesised suffix from the names.
genres = np.unique(','.join(df_genres.loc[df_genres["genreAlt"] != ""]['genreAlt']).split(','))
genres = [g.split("(")[0] for g in genres]
print(f"{len(genres)} Genres: {genres}")
# One-hot encoding. regex=False matches the genre name literally instead of
# interpreting it as a regular expression.
for genre in genres:
    df_genres[f"G_{genre}"] = df_genres["genreAlt"].str.contains(genre, regex=False).astype('int')
df_genres.drop("genreAlt", axis=1, inplace=True)
df_genres.tail()
# Number of films in the "기타" (other) genre.
df_genres["G_기타"].sum()
# Films without any genre are assigned to "기타".
df_genres.loc[df_genres.filter(like="G_").sum(axis=1) == 0, "G_기타"] = 1
df_genres["G_기타"].sum()
# Films with two or more genres are flagged as multi-genre ("복합장르").
df_genres["복합장르"] = 0
df_genres.loc[df_genres.filter(like="G_").sum(axis=1) > 1, "복합장르"] = 1
# The film carrying the largest number of genres.
df_genres.iloc[df_genres.filter(like="G_").sum(axis=1).argmax()]
# Genre ranking dataset.
df_genres_top21 = df_genres.filter(like="G_").sum().sort_values(ascending=False)[:21]
df_genres_top21
# Yearly genre dataset: per-year film count merged with per-year genre sums.
df_genresY = df_genres[["movieCd", "openYear"]].groupby("openYear").count().reset_index().merge(df_genres.iloc[:,2:].groupby("openYear").sum().reset_index())
df_genresY.drop("movieCd", axis=1, inplace=True)
df_genresY.fillna(0, inplace=True)
df_genresY.head()
# Back up the data.
df_genres.to_pickle("./data/df_genres.pkl")
df_genresY.to_pickle("./data/df_genresY.pkl")
def _yearly_genre_table(movies):
    """Return per-year genre sums for a per-film genre frame.

    Counts films per ``openYear``, merges that with the per-year sums of every
    column from the third one on, then drops the count column and fills NaN with 0.
    """
    per_year_count = movies[["movieCd", "openYear"]].groupby("openYear").count().reset_index()
    per_year_sum = movies.iloc[:, 2:].groupby("openYear").sum().reset_index()
    yearly = per_year_count.merge(per_year_sum)
    yearly.drop("movieCd", axis=1, inplace=True)
    yearly.fillna(0, inplace=True)
    return yearly


# Korean films: per-film data, yearly data, backup.
df_genres_kr = df_genres.loc[df_nations["N_한국"]==1]
df_genres_kr.head()
df_genresY_kr = _yearly_genre_table(df_genres_kr)
df_genresY_kr.head()
df_genres_kr.to_pickle("./data/df_genres_kr.pkl")
df_genresY_kr.to_pickle("./data/df_genresY_kr.pkl")

# Foreign films: per-film data, yearly data, backup.
df_genres_nkr = df_genres.loc[df_nations["해외"]==1]
df_genres_nkr.head()
df_genresY_nkr = _yearly_genre_table(df_genres_nkr)
df_genresY_nkr.head()
df_genres_nkr.to_pickle("./data/df_genres_nkr.pkl")
df_genresY_nkr.to_pickle("./data/df_genresY_nkr.pkl")
# Row masks (indexed like df_nations) and the number of genres per film.
is_kr = df_nations["N_한국"]==1
is_nkr = df_nations["해외"]==1
genres_per_movie = df_genres.filter(like="G_").sum(axis=1)

# Average number of genres per Korean film.
genres_per_movie.loc[is_kr].sum()/df_nations["N_한국"].sum()
# Share of each genre among Korean films.
genre_totals_kr = df_genres.loc[is_kr].filter(like="G_").sum()
genres_portions_kr = genre_totals_kr.sort_values(ascending=False)/genre_totals_kr.sum()
patches, texts = plt.pie(genres_portions_kr)
plt.legend(patches, labels=genres_portions_kr.index, ncol=3, fontsize="xx-small")

# Average number of genres per foreign film.
genres_per_movie.loc[is_nkr].sum()/df_nations["해외"].sum()
# Share of each genre among foreign films.
# NOTE(review): the result is stored under the *_kr name again, as in the original.
genre_totals_nkr = df_genres.loc[is_nkr].filter(like="G_").sum()
genres_portions_kr = genre_totals_nkr.sort_values(ascending=False)/genre_totals_nkr.sum()
patches, texts = plt.pie(genres_portions_kr)
plt.legend(patches, labels=genres_portions_kr.index, ncol=3, fontsize="xx-small")
# Share of each genre among foreign films, in ascending order.
genre_totals_nkr.sort_values()/genre_totals_nkr.sum()

# Spot checks by title.
# Are adult films filed under drama? '애마부인 2016' is not classified as adult.
df_genres.loc[df_genres["movieNm"].str.contains("애마")]
# Are children's animations included?
df_genres.loc[df_genres["movieNm"].str.contains("영구")]
df_genres.loc[df_genres["movieNm"].str.contains("우뢰매")]
# Folder where the generated images are saved.
!mkdir images
# Colour code per nation.
c_kr = "darkblue" # Korea
c_etc = "0.7" # other / foreign
c_it = "g" # Italy
c_cn = "darkred" # China
c_fr = "gold" # France
c_hk = "orangered" # Hong Kong
c_gb = "orchid" # United Kingdom
c_jp = "thistle" # Japan
c_us = "mediumpurple" # United States
# Overview figure: top production nations (left) and genres (right), each as a
# horizontal bar chart with an inset pie. The loop and if/else bodies are
# re-indented here; they had been flattened to column 0, which is a syntax error.
fig, axs = plt.subplots(ncols=2, figsize=(10, 8), constrained_layout=True)

# Nations
blues_r = plt.get_cmap("Blues_r")
sns.barplot(x=df_nations_top20.values, y=df_nations_top20.index, ax=axs[0])
for i, p in enumerate(axs[0].patches):
    c = blues_r(i/20)
    p.set_facecolor(c)
# Value labels: inside the bar (white) for long bars, to its right otherwise.
for i, y in enumerate(df_nations_top20.index):
    val = df_nationsY[y].sum()
    if val > 2000:
        axs[0].text(val-100, i, str(val), c="w", fontsize="small", fontweight="bold", ha="right", va="center")
    else:
        axs[0].text(val+30, i, str(val), c=blues_r(i/30), fontsize="small", fontweight="bold", ha="left", va="center")
# Tick labels: strip the "N_" prefix from the column names.
axs[0].set_yticklabels([n.split("_")[1] for n in df_nations_top20.index], fontweight="bold")
font_title = {"fontweight":"bold", "color":"0.4"}
# NOTE(review): the title says "top 10 of 74" while 20 bars are drawn — confirm the intended wording.
axs[0].set_title("제작 국가 (top 10 of 74)", fontdict=font_title, pad=16)

# Inset pie for nations; wedge radius shrinks by 0.1 per rank and wedges from
# the 11th on get radius 0.
ax_pie0 = fig.add_axes([0.15, 0.35, 0.4, 0.4])
ax_pie0.pie(df_nations_top20.values)
for i, (p, v) in enumerate(zip(ax_pie0.patches, df_nations_top20.values)):
    # Revised following feedback from Subin An, Junwon Hwang and Youngdo Kim. Thank you :)
    r = 1-0.1*i if i < 10 else 0
    p.set_radius(r)
    p.set_facecolor(blues_r(i/20))
ax_pie0.text(0.25, 0.3, f"한국\n{df_nations_top20['N_한국']/df_nations_top20.sum()*100:.0f}%",
             fontsize="small", color="w", fontweight="bold", ha="center")
ax_pie0.text(-0.5, 0, f"미국\n{df_nations_top20['N_미국']/df_nations_top20.sum()*100:.0f}%",
             fontsize="x-small", color="w", fontweight="bold", ha="center")
ax_pie0.text(-0.2, -0.38, "일본",
             fontsize="xx-small", color="w", fontweight="bold", ha="center")
ax_pie0.text(-0.2, -0.55, f"{df_nations_top20['N_일본']/df_nations_top20.sum()*100:.0f}%",
             fontsize="xx-small", color="w", fontweight="bold", ha="center")

# Genres
gist_earth = plt.get_cmap("gist_earth")
sns.barplot(x=df_genres_top21.values, y=df_genres_top21.index, ax=axs[1])
for i, p in enumerate(axs[1].patches):
    c = gist_earth(i/35+0.5)
    p.set_facecolor(c)
    p.set_edgecolor("k")
    p.set_linewidth(0.2)
for i, y in enumerate(df_genres_top21.index):
    val = df_genres_top21[y].sum()
    if val > 4000:
        axs[1].text(val-100, i, str(val), c="w", fontsize="small", fontweight="bold", ha="right", va="center")
    else:
        axs[1].text(val+50, i, str(val), c=gist_earth(i/55+0.5), fontsize="small", fontweight="bold", ha="left", va="center")
axs[1].set_yticklabels([n.split("_")[1] for n in df_genres_top21.index], fontweight="bold")
axs[1].set_title("장르 (21 of 21)", fontdict=font_title, pad=16)

# Spines
for ax in axs:
    ax.spines[["top", "right", "bottom"]].set_visible(False)

# Inset pie for genres, same radius scheme as above.
ax_pie1 = fig.add_axes([0.65, 0.35, 0.4, 0.4])
ax_pie1.pie(df_genres_top21.values)
for i, (p, v) in enumerate(zip(ax_pie1.patches, df_genres_top21.values)):
    r = 1 - 0.1*i if i < 10 else 0
    p.set_radius(r)
    p.set_facecolor(gist_earth(i/35+0.5))
ax_pie1.text(0.37, 0.28, f"드라마\n{df_genres_top21['G_드라마']/df_genres_top21.sum()*100:.0f}%",
             fontsize="small", color="w", fontweight="bold", ha="center")
ax_pie1.text(-0.23, 0.27, f"멜로/\n로맨스\n{df_genres_top21['G_멜로/로맨스']/df_genres_top21.sum()*100:.0f}%",
             fontsize="x-small", color="w", fontweight="bold", ha="center")
ax_pie1.text(-0.55, 0.13, "액션",
             fontsize="xx-small", color="w", fontweight="bold", ha="center")
ax_pie1.text(-0.4, 0, f"{df_genres_top21['G_액션']/df_genres_top21.sum()*100:.0f}%",
             fontsize="xx-small", color="w", fontweight="bold", ha="center")
fig.savefig("./images/overview.png", dpi=200)
# Figure: number of films vs. total number of genre tags, Korean (x=0) and foreign (x=1).
fig, ax = plt.subplots(figsize=(10, 5), constrained_layout=True)

# Totals: genre tags and films, Korean and foreign.
genre_tags_per_movie = df_genres.filter(like="G_").sum(axis=1)
n_genres_kr = genre_tags_per_movie.loc[df_nations["N_한국"]==1].sum()
n_genres_nkr = genre_tags_per_movie.loc[df_nations["해외"]==1].sum()
n_movies_kr = df_nations["N_한국"].sum()
n_movies_nkr = df_nations["해외"].sum()

# Genre-tag bars (white, nudged right by 0.02), then film-count bars on top.
for x_bar, n_tags in ((0.02, n_genres_kr), (1.02, n_genres_nkr)):
    ax.bar(x_bar, n_tags,
           width=0.4, fc="w", ec="0.2")
for x_bar, n_films, face in ((0, n_movies_kr, c_kr), (1, n_movies_nkr, c_etc)):
    ax.bar(x_bar, n_films,
           width=0.4, fc=face, ec="0.3")

# Value labels just below each bar top.
offset = 300
value_labels = (
    (0, n_genres_kr, "k"),
    (1, n_genres_nkr, "k"),
    (0, n_movies_kr, "w"),
    (1, n_movies_nkr, "w"),
)
for x_txt, value, txt_color in value_labels:
    ax.text(x_txt, value-offset, format(value, ","),
            fontsize="medium", ha="center", va="top", color=txt_color)
# Captions above the bars: genres per film.
ax.text(0, n_genres_kr+offset*3, f"한국영화\n{n_genres_kr/n_movies_kr:.2f} 장르/편",
        fontweight="bold", ha="center", va="bottom")
ax.text(1, n_genres_nkr+offset*3, f"해외영화\n{n_genres_nkr/n_movies_nkr:.2f} 장르/편",
        fontweight="bold", ha="center", va="bottom")

### legend
# p weights the Korean side when interpolating the label heights between the bars.
p = 0.7
# Total number of releases
ratio_num = n_movies_nkr/n_movies_kr
h_num = p*n_movies_kr/2 + (1-p)*n_movies_nkr/2
ax.text(0.5, h_num,
        f"총 개봉 편 수\n", color="0.4", fontweight="bold", ha="center", va="center", zorder=2,
        bbox={"boxstyle":"round", "pad":0.4, "facecolor":'0.9', "edgecolor":'none', "linewidth":0})
ax.text(0.5, h_num-1000,
        f"1 : {ratio_num:.2f}", color="k", fontweight="bold", ha="center", va="center", zorder=2)
# Total number of genre tags
ratio_genre = n_genres_nkr/n_genres_kr
mid_genre_kr = (n_genres_kr-n_movies_kr)/2 + n_movies_kr
mid_genre_nkr = (n_genres_nkr-n_movies_nkr)/2 + n_movies_nkr
h_genre = p*mid_genre_kr + (1-p)*mid_genre_nkr
ax.text(0.5, h_genre,
        "장르 총 합\n", color="0.4", fontweight="bold", ha="center", va="center", zorder=2,
        bbox={"boxstyle":"round", "pad":0.4, "facecolor":'w', "edgecolor":'0.9', "linewidth":2})
ax.text(0.5, h_genre-1000,
        f"1 : {ratio_genre:.2f}", color="k", fontweight="bold", ha="center", va="center", zorder=2)

# Connector lines from each bar's mid-height through the label position.
ax.plot([0.2, 0.5, 0.8],
        [n_movies_kr/2, h_num, n_movies_nkr/2],
        c="0.2", alpha=0.3)
ax.plot([0.22, 0.5, 0.82],
        [mid_genre_kr, h_genre, mid_genre_nkr],
        c="0.2", alpha=0.3)

# Axes cosmetics
ax.spines[["left", "top", "right"]].set_visible(False)
ax.set_xlim(-0.4, 1.4)
ax.set_xticks([0, 1])
ax.set_xticklabels([])
ax.set_yticks([])
ax.grid(axis="x")
fig.savefig("./images/genres_movie_ratio.png", dpi=200)
# Build the frame: a wide panel for release counts and a narrow one for nation ratios.
fig, axs = plt.subplots(
    ncols=2,
    sharey=True,
    figsize=(14, 20),
    gridspec_kw={"width_ratios":[5,1]},
    constrained_layout=True,
)
for panel, panel_title in zip(axs, ("국내 개봉 영화 (편)", "제작 국가 비율")):
    panel.set_title(panel_title, fontdict=font_title, pad=16)
# Height/width ratio of each panel in figure coordinates, used later to set the image aspect.
bbox0 = axs[0].get_position()
bbox1 = axs[1].get_position()
portion_aspect0 = bbox0.height/bbox0.width
portion_aspect1 = bbox1.height/bbox1.width
print(portion_aspect0)
# Shared y axis: years, running downwards from 1971 to 2020.
yticks = [1971] + list(range(1975, 2025, 5))
axs[0].set_ylim(2020, 1971)
axs[0].set_yticks(yticks)
axs[0].set_yticklabels(yticks)
axs[1].set_xticks([0, 1])
axs[1].set_xticklabels([0, "100%"])
# Number of films released in Korea, drawn on a separate figure and saved as an
# image (loaded back and placed into axs[0] further down).
fig_p0, ax_p0 = plt.subplots(figsize=(axs[0].get_position().height * 20, axs[0].get_position().width * 14), constrained_layout=True)
# stackplot expects a list of colours: on recent Matplotlib releases a bare
# string is cycled character by character ("darkblue" -> "d", "a", ...).
ax_p0.stackplot(df_nationsY["openYear"], -df_nationsY["N_한국"], colors=[c_kr], ec="w", lw=0.5)
ax_p0.stackplot(df_nationsY["openYear"], df_nationsY["해외"], colors=[c_etc], ec="w", lw=0.5)
# Yearly series of every top-20 entry except the first (assumed to be Korea — TODO confirm).
nationsY_top20_foreign = [df_nationsY[n] for n in df_nations_top20.index[1:]]
ax_p0.stackplot(df_nationsY["openYear"],
                nationsY_top20_foreign,
                colors=[c_us, c_jp, c_hk, c_fr, c_gb] + [c_etc]*14, ec="w", lw=0.5)
ax_p0.set_ylim(-1300, 1300)
ax_p0.set_xlim(1971, 2020)
ax_p0.axis(False)
fig_p0.savefig("./images/portion_year0.png", dpi=200, pad_inches=0)

# Share of films per nation and year, again rendered to an image.
fig_p, ax_p = plt.subplots(figsize=(axs[1].get_position().height * 20, axs[1].get_position().width * 14), constrained_layout=True)
# Per-year total over all N_* columns, computed once instead of once per layer.
nation_total_per_year = df_nationsY.filter(like="N_").sum(axis=1)
ratio_columns = ["N_한국", "N_미국", "N_일본", "N_홍콩", "N_프랑스", "N_영국"]
ax_p.stackplot(df_nationsY.index,
               *[df_nationsY[col]/nation_total_per_year for col in ratio_columns],
               colors=[c_kr, c_us, c_jp, c_hk, c_fr, c_gb], ec="w", lw=0.5)
# Whatever the six layers do not cover shows the "other" colour of the background.
ax_p.set_facecolor(c_etc)
ax_p.set_xlim(1, df_nationsY.shape[0]-1)
ax_p.set_ylim(0, 1)
ax_p.set_xticks([])
ax_p.set_yticks([])
ax_p.spines[["top", "bottom", "left", "right"]].set_visible(False)
fig_p.savefig("./images/portion_year1.png", dpi=200, pad_inches=0)
# Historical events
def plot_history(year, text, text_x_shift=50, text_y=None, text_c="green", text_size="medium", text_fc="w", text_align="center",
                 c_h0 = "limegreen", c_h1 = "palegreen", alpha_h1=0.7, ax=axs):
    """Mark a historical event with a horizontal line at ``year`` and a label.

    Parameters
    ----------
    year : y position of the line.
    text : label drawn on the first axes.
    text_x_shift : x offset of the label from the lower x bound of the first axes.
    text_y : y position of the label; defaults to ``year - 0.5``.
    text_c, text_size, text_fc, text_align : label colour, font size, box face
        colour and multi-line alignment.
    c_h0 : line colour on the first axes (drawn with varying alpha).
    c_h1, alpha_h1 : line colour and constant alpha on the remaining axes.
    ax : a single Axes, an ndarray of Axes, or None for ``plt.gca()``. The
        default is the module-level ``axs`` as it was when this function was
        defined.

    The body used to read the global ``axs`` instead of this parameter, so the
    axes passed in were partly ignored.
    """
    if text_y is None:
        text_y = year-0.5
    y_line = [year] * 100

    if ax is None:
        ax = plt.gca()
    if isinstance(ax, np.ndarray):
        ax_0 = ax[0]
        ax_rest = ax[1:]
    else:
        ax_0 = ax
        ax_rest = []

    # First axes: 99 short segments, each drawn with its own alpha.
    x0_line = np.linspace(ax_0.get_xbound()[1], ax_0.get_xbound()[0], 100)
    for i in range(99):
        ax_0.plot(x0_line[i:i+2], y_line[i:i+2], c=c_h0,
                  solid_capstyle='butt', alpha=np.power(np.sin(i/100),6)*2, zorder=15)
    ax_0.text(ax_0.get_xbound()[0]+text_x_shift, text_y, text, c=text_c, fontsize=text_size,
              multialignment=text_align, ha="left",
              bbox={"boxstyle":"square", "pad":0.4, "facecolor":text_fc, "edgecolor":"none", "linewidth":1})

    # Remaining axes: same segments with constant alpha. As before, the x range
    # of the second axes is used for all of them.
    if len(ax_rest) > 0:
        x1_line = np.linspace(ax[1].get_xbound()[1], ax[1].get_xbound()[0], 100)
        for ax_ in ax_rest:
            for i in range(99):
                ax_.plot(x1_line[i:i+2], y_line[i:i+2], c=c_h1,
                         solid_capstyle='butt', alpha=alpha_h1, zorder=15)
## Assembly
# axs[0]: load the pre-rendered stack plot, swap its first two axes, reverse
# the second and trim 10 pixels from every edge.
fig0_img = plt.imread("./images/portion_year0.png")
fig0_img = fig0_img.swapaxes(0, 1)[:,::-1, :][10:-10,10:-10,:]
print(fig0_img.shape)
x0, x1 = -1300, 1300
y1, y0 = 1971, 2020
axs[0].imshow(fig0_img, extent=[x0, x1, y0, y1])
axs[0].set_aspect(abs(x1-x0)/abs(y1-y0)*portion_aspect0 *20/14)
# Korean films extend to negative x; label the ticks with absolute counts.
xticks = [-1000, -500, 0, 500, 1000]
axs[0].set_xticks(xticks)
axs[0].set_xticklabels([abs(x) for x in xticks])
# Faint horizontal guide at each labelled year (the loop body is re-indented;
# it had been flattened to column 0).
for y in yticks:
    axs[0].axhline(y, c="lightgray", alpha=0.2, zorder=-1)
axs[0].grid(False)
# plot_stem
def plot_stem(x, y, x0=0, s=100, ls="-", lw=2, c="cyan", marker="D",
              text="sample", text_size="medium", position="right", bbox_lw=2, bbox_fc="w", ax=None, **kwargs):
    """Draw a horizontal stem from ``(x0, y)`` to a marker at ``(x, y)`` with an optional label.

    Parameters
    ----------
    x, y : marker position.
    x0 : x position where the stem starts.
    s, marker : marker size and style, passed to ``ax.scatter``.
    ls, lw, c : line style, line width and colour of the stem; ``c`` is also
        used for the marker and the label box edge.
    text : label; nothing is written when it is empty.
    text_size : label font size.
    position : ``"right"`` puts the label 30 data units right of the marker,
        anything else puts it 30 units to the left.
    bbox_lw, bbox_fc : label box line width and face colour.
    ax : target axes; defaults to ``plt.gca()`` (the ``None`` default used to
        raise AttributeError).
    **kwargs : forwarded to ``ax.scatter``.
    """
    if ax is None:
        ax = plt.gca()

    ax.plot([x0, x], [y, y], ls=ls, lw=lw, c=c)
    ax.scatter(x, y, s=s, c=c, marker=marker, **kwargs)
    if not text:
        return

    offset = 30
    on_right = position == "right"
    text_pos = x+offset if on_right else x-offset
    ha = "left" if on_right else "right"
    va = "center"
    ax.text(text_pos, y, text, ha=ha, va=va, fontsize=text_size,
            bbox={"boxstyle":"round", "pad":0.4,
                  "facecolor":bbox_fc, "edgecolor":c, "linewidth":bbox_lw},
            zorder=20)
### Legends
handles, labels = axs[0].get_legend_handles_labels()
# Korea: stem pointing left with a larger, tinted label.
plot_stem(-250, 1972, x0=-100, c=c_kr, bbox_lw=3, text=f" 한국: 총 {df_nations['N_한국'].sum()}편 ",
          text_size="large", bbox_fc="aliceblue", position="left", ax=axs[0])
# Foreign nations: (marker x, year, stem start x, colour, nation name), drawn in this order.
stem_specs = [
    (200, 1982, 10, c_us, "미국"),
    (600, 2008.5, 170, c_jp, "일본"),
    (550, 1991.5, 220, c_fr, "프랑스"),
    (400, 1988.5, 95, c_hk, "홍콩"),
    (550, 2005, 170, c_gb, "영국"),
]
for stem_x, stem_y, stem_x0, stem_c, nation_name in stem_specs:
    plot_stem(stem_x, stem_y, x0=stem_x0, c=stem_c,
              text=f"{nation_name}: 총 {df_nations_top20[f'N_{nation_name}']}편", ax=axs[0])
# Others: all foreign films minus the top-20 entries at positions 1 to 5.
plot_stem(700, 2012, x0=450, c=c_etc, text=f"기타: 총 {df_nations['해외'].sum()-df_nations_top20.iloc[1:6].sum()}편",
          ax=axs[0])
# Caption for the foreign side as a whole.
axs[0].text(250, 1972, f" 해외: 총 {df_nations['해외'].sum()}편 ", ha="left", va="center",
            bbox={"boxstyle":"round", "pad":0.4,
                  "facecolor":"0.95", "edgecolor":"k", "linewidth":3},
            zorder=20)

# axs[1]: place the pre-rendered ratio image, transformed like the first panel's image.
fig1_img = plt.imread("./images/portion_year1.png")
fig1_img = fig1_img.swapaxes(0, 1)
fig1_img = fig1_img[:,::-1, :]
fig1_img = fig1_img[10:-10,10:-10,:]
x0, x1 = 0, 1
y1, y0 = 1971, 2020
axs[1].imshow(fig1_img, extent=[x0, x1, y0, y1])
axs[1].set_aspect(abs(x1-x0)/abs(y1-y0)*portion_aspect1 * 20/14)
axs[1].grid(False)
# Historical events for the release-count figure, as (year, label, extra keyword arguments).
_crisis_style = {"text_c": "r", "c_h0": "crimson", "c_h1": "deeppink"}
_history_events = [
    (1975, "컬러TV 시험방송", {}),                                  # colour TV test broadcast
    (1981, "컬러TV 본방송 개시", {}),                               # regular colour TV broadcast
    (1988, "헐리우드 직배개시", {}),                                 # direct Hollywood distribution
    (1992, "한중 수교, 대만 단교", {}),                              # Korea-China diplomatic ties
    (1997, "IMF 외환위기", _crisis_style),                          # IMF crisis
    (1998, "멀티플렉스 (CGV 테크노마트)", {"text_y": 1999}),          # multiplex
    (2004, "일본 영화 전면 개방 (일본문화 4차 개방)", {}),             # Japanese films fully opened
    (2006, "스크린 쿼터 축소 (40% → 20%)", {}),                     # screen quota
    (2008, "글로벌 금융위기", _crisis_style),                        # global financial crisis
    (2010, "주말의 명화 폐지 (1969-2010)", {}),                      # end of the weekend movie TV slot
    (2011, "일본 문화 홍보 전략 'Cool Japan' 추진", {"text_y": 2012}),  # Cool Japan strategy
    (2015, "유튜브 프리미엄 개시", {}),                              # YouTube Premium
    (2016, "넷플릭스 상륙", {"text_y": 2017}),                       # Netflix arrives
    (2019, "코로나19", {}),                                        # COVID-19
]
for _event_year, _event_label, _event_opts in _history_events:
    plot_history(_event_year, _event_label, **_event_opts)
axs[1].set_xlim(0, 1)
fig.savefig("./images/num_year.png", dpi=200)
display(fig)
# Genre names in ranking order, with the "G_" prefix removed and "기타" (other) left out.
genres_noetc = [col.split("_")[1] for col in df_genres_top21.index if "기타" not in col]
genres_noetc
# Colour per genre, picked from the tab20 colormap by slot number.
cmap = plt.get_cmap("tab20")


def _tab20(slot):
    """Return the tab20 colour at position ``slot / 20``."""
    return cmap(slot/20)


c_drama = _tab20(0)      # drama
c_romance = _tab20(1)    # melodrama / romance
c_action = _tab20(2)     # action
c_comedy = _tab20(3)     # comedy
c_thriller = _tab20(12)  # thriller
c_ero = _tab20(13)       # adult
c_horror = _tab20(6)     # horror
c_crime = _tab20(7)      # crime
c_ani = _tab20(8)        # animation
c_adv = _tab20(9)        # adventure
c_sf = _tab20(10)        # SF
c_fantasy = _tab20(11)   # fantasy
c_mistery = _tab20(4)    # mystery
c_docu = _tab20(5)       # documentary
c_family = _tab20(14)    # family
c_history = _tab20(15)   # historical drama
c_war = _tab20(16)       # war
c_play = _tab20(17)      # performance
c_musical = _tab20(18)   # musical
c_western = _tab20(19)   # western
# Colour list in the order used for the stacked genre plots; "other" comes last.
c_genres = [
    c_drama, c_romance, c_action, c_comedy, c_thriller, c_ero, c_horror,
    c_crime, c_ani, c_adv, c_sf, c_fantasy, c_mistery, c_docu,
    c_family, c_history, c_war, c_play, c_musical, c_western, c_etc,
]
# Build the frame: one wide panel for genre counts and two narrow ratio panels.
fig, axs = plt.subplots(ncols=3, gridspec_kw={"width_ratios":[5,1,1]},
                        sharey=True,
                        figsize=(14, 20), constrained_layout=True)
axs[0].set_title("국내 개봉 영화 장르 (편)", fontdict=font_title, pad=16)
axs[1].set_title("한국 영화 비율", fontdict=font_title, pad=16)
axs[2].set_title("해외 영화 비율", fontdict=font_title, pad=16)
# Height/width ratio of each panel in figure coordinates, used later to set the image aspect.
portion_aspect0 = axs[0].get_position().height/axs[0].get_position().width
portion_aspect1 = axs[1].get_position().height/axs[1].get_position().width
portion_aspect2 = axs[2].get_position().height/axs[2].get_position().width
# Shared y axis: years, running downwards from 1971 to 2020.
yticks = [1971] + list(range(1975, 2025, 5))
axs[0].set_ylim(2020, 1971)
axs[0].set_yticks(yticks)
axs[0].set_yticklabels(yticks)
# Both ratio panels get a 0-100% x axis (the loop body is re-indented; it had
# been flattened to column 0).
for ax in axs[1:]:
    ax.set_xticks([0, 1])
    ax.set_xticklabels([0, "100%"])
# Genres per year, drawn on a separate figure and saved as an image.
sns.set_palette("tab20")
fig_p0, ax_p0 = plt.subplots(figsize=(axs[0].get_position().height * 20, axs[0].get_position().width * 14), constrained_layout=True)
# One layer per genre: Korean films negated, foreign films positive.
# Plain column indexing replaces the former eval() of a formatted string.
stack_ys_kr = [-df_genresY_kr[f"G_{g}"] for g in genres_noetc]
stack_ys_nkr = [df_genresY_nkr[f"G_{g}"] for g in genres_noetc]
ax_p0.stackplot(df_genresY_kr["openYear"], *stack_ys_kr,
                ec="w", lw=0.4, colors=c_genres[:20])
ax_p0.stackplot(df_genresY_nkr["openYear"], *stack_ys_nkr,
                ec="w", lw=0.4, colors=c_genres[:20])
# G_기타: draw the total of all genres behind the layers (zorder=-1) in the
# "other" colour. The returned collections are recoloured directly instead of
# being looked up as get_children()[40] / [41], which only worked with exactly
# 20 genre layers on each side.
total_kr_layer = ax_p0.stackplot(df_genresY_kr["openYear"],
                                 -df_genresY_kr.filter(like="G_").sum(axis=1),
                                 ec="w", lw=0.4, zorder=-1)
total_nkr_layer = ax_p0.stackplot(df_genresY_nkr["openYear"],
                                  df_genresY_nkr.filter(like="G_").sum(axis=1),
                                  ec="w", lw=0.4, zorder=-1)
total_kr_layer[0].set_facecolor(c_etc)
total_nkr_layer[0].set_facecolor(c_etc)
ax_p0.set_xlim(1971, 2020)
ax_p0.set_ylim(-2200, 2200)
ax_p0.axis(False)
fig_p0.savefig("./images/genres_year0.png")
# Genre ratio of Korean films.
# The Korean layers are all negated, so the negated "other" layer matches them
# and the signs cancel in the division.
# NOTE: append() mutates stack_ys_kr, so re-running this cell adds the layer again.
stack_ys_kr.append(-df_genresY_kr['G_기타'])
stack_ys_kr_p = np.array(stack_ys_kr)/np.array(stack_ys_kr).sum(axis=0)
stack_ys_kr_p.shape
fig_p1, ax_p1 = plt.subplots(figsize=(axs[1].get_position().height * 20, axs[1].get_position().width * 14), constrained_layout=True)
layers_kr = ax_p1.stackplot(df_genresY_kr["openYear"], *stack_ys_kr_p, colors=c_genres,
                            ec="none", lw=0.5)
# The last layer is "other".
layers_kr[-1].set_facecolor(c_etc)
ax_p1.set_xlim(1971, 2020)
ax_p1.set_ylim(0, 1)
ax_p1.set_xticks([])
ax_p1.set_yticks([])
ax_p1.spines[["top", "bottom", "left", "right"]].set_visible(False)
fig_p1.savefig("./images/genres_year1.png", dpi=200)

# Genre ratio of foreign films.
# The foreign layers are positive, so "other" must be appended with a positive
# sign too. The negated series used before made its share negative and shrank
# the denominator, inflating every other genre's ratio.
stack_ys_nkr.append(df_genresY_nkr['G_기타'])
stack_ys_nkr_p = np.array(stack_ys_nkr)/np.array(stack_ys_nkr).sum(axis=0)
stack_ys_nkr_p.shape
fig_p2, ax_p2 = plt.subplots(figsize=(axs[2].get_position().height * 20, axs[2].get_position().width * 14), constrained_layout=True)
layers_nkr = ax_p2.stackplot(df_genresY_nkr["openYear"], *stack_ys_nkr_p, colors=c_genres,
                             ec="none", lw=0.5)
layers_nkr[-1].set_facecolor(c_etc)
ax_p2.set_xlim(1971, 2020)
ax_p2.set_ylim(0, 1)
ax_p2.set_xticks([])
ax_p2.set_yticks([])
ax_p2.spines[["top", "bottom", "left", "right"]].set_visible(False)
fig_p2.savefig("./images/genres_year2.png", dpi=200)
## Assembly
def _load_panel_image(path):
    """Read an image, swap its first two axes, reverse the second and trim 10 px from every edge."""
    return plt.imread(path).swapaxes(0, 1)[:,::-1, :][10:-10,10:-10,:]


# axs[0]: genre counts image with the total release counts drawn on top.
fig0_img = _load_panel_image("./images/genres_year0.png")
x0, x1 = -2200, 2200
y1, y0 = 1971, 2020
axs[0].imshow(fig0_img, extent=[x0, x1, y0, y1])
axs[0].set_aspect(abs(x1-x0)/abs(y1-y0)*portion_aspect0*20/14)
release_years = df_nationsY["openYear"]
axs[0].plot(-df_nationsY["N_한국"], release_years,
            c="k", lw=2, zorder=19, label="개봉 영화 편 수")
axs[0].plot(df_nationsY["해외"], release_years,
            c="k", lw=2, zorder=19)
axs[0].grid(False)
axs[0].set_xlim(-2200, 2200)
# Korean films extend to negative x; label the ticks with absolute counts.
xticks = [-2000, -1000, 0, 1000, 2000]
axs[0].set_xticks(xticks)
axs[0].set_xticklabels([abs(tick) for tick in xticks])
yticks = [1971] + list(range(1975, 2025, 5))
axs[0].set_yticks(yticks)
axs[0].set_yticklabels(yticks)
# Side captions at the top of the panel: Korean films left, foreign films right.
for caption_x, caption, caption_fc in ((-1100, f" 한국 영화 ", "aliceblue"),
                                       (1100, f" 해외 영화 ", "0.95")):
    axs[0].text(caption_x, 1971, caption, ha="center", va="center",
                bbox={"boxstyle":"round", "pad":0.4,
                      "facecolor":caption_fc, "edgecolor":"k", "linewidth":3},
                zorder=20)

# axs[1]: Korean films
fig1_img = _load_panel_image("./images/genres_year1.png")
x0, x1 = 0, 1
y1, y0 = 1971, 2020
axs[1].imshow(fig1_img, extent=[x0, x1, y0, y1])
axs[1].set_aspect(abs(x1-x0)/abs(y1-y0)*portion_aspect1*20/14)
axs[1].grid(False)

# axs[2]: foreign films
fig2_img = _load_panel_image("./images/genres_year2.png")
x0, x1 = 0, 1
y1, y0 = 1971, 2020
axs[2].imshow(fig2_img, extent=[x0, x1, y0, y1])
axs[2].set_aspect(abs(x1-x0)/abs(y1-y0)*portion_aspect2*20/14)
axs[2].grid(False)
### history
# Historical events for the genre figure, as (year, label, extra keyword arguments).
# Every call shares the same label offset, transparent label box and faint side lines.
_shared_opts = {"text_x_shift": 100, "text_fc": 'none', "alpha_h1": 0.3, "ax": axs}
_crisis_opts = {"text_c": "r", "c_h0": "crimson", "c_h1": "deeppink"}
_genre_history_events = [
    (1975, "컬러TV 시험방송", {}),                                       # colour TV test broadcast
    (1981, "컬러TV 본방송 개시", {}),                                    # regular colour TV broadcast
    (1988, "헐리우드 직배개시", {}),                                      # direct Hollywood distribution
    (1992, "한중 수교, 대만 단교", {}),                                   # Korea-China diplomatic ties
    (1997, "IMF 외환위기", _crisis_opts),                                # IMF crisis
    (1998, "멀티플렉스 (CGV 테크노마트)", {"text_y": 1999}),               # multiplex
    (2004, "일본 영화 전면 개방 (4차 개방)", {}),                          # Japanese films fully opened
    (2006, "스크린 쿼터 축소 (40% → 20%)", {}),                          # screen quota
    (2008, "글로벌 금융위기", _crisis_opts),                              # global financial crisis
    (2010, "주말의 명화 폐지 (1969-2010)", {}),                           # end of the weekend movie TV slot
    (2011, "일본 문화 홍보 전략 \n'Cool Japan' 추진", {"text_y": 2012.5}),  # Cool Japan strategy
    (2015, "유튜브 프리미엄 개시", {}),                                   # YouTube Premium
    (2016, "넷플릭스 상륙", {"text_y": 2017}),                            # Netflix arrives
    (2019, "코로나19", {}),                                             # COVID-19
]
for _event_year, _event_label, _event_opts in _genre_history_events:
    plot_history(_event_year, _event_label, **_shared_opts, **_event_opts)
### legend
# Draw one hidden bar per genre (zorder=-2) purely to obtain legend handles in
# genre colours. The loop body is re-indented; it had been flattened to column 0.
genres_order = genres_noetc + ["기타"]
for i, c in enumerate(c_genres):
    axs[0].bar(0, 1, label=genres_order[i], fc=c, zorder=-2)
# One extra "기타" bar; its handle is dropped from the legend below via [:-1].
axs[0].bar(0, 1, label="기타", zorder=-2)
axs[0].patches[20].set_facecolor(c_etc)
handles, labels = axs[0].get_legend_handles_labels()
axs[0].legend(handles=handles[:-1], labels=labels[:-1],
              fontsize="medium", bbox_to_anchor=(0.97, 0.95), loc="upper right")
fig.savefig("./images/genre_year.png", dpi=300)
display(fig)
# Top genres per country
def _genre_share_by_nation(nation):
    """Return (movie index, genre share) for the films flagged ``N_<nation> == 1``.

    The share of each ``G_*`` column is its sum over the selected films divided
    by the number of selected films, sorted in ascending order.
    """
    flagged = df_nations.query(f"N_{nation} == 1")
    movie_index = flagged[["movieCd", "movieNm", "openYear"]].index
    genre_totals = df_genres.loc[movie_index].filter(like="G_").sum()
    return movie_index, genre_totals.sort_values(ascending=True) / len(movie_index)


# 1. Korea - its ascending genre order is reused to order every other country.
idx_movies_kr, genres_portion_kr = _genre_share_by_nation("한국")
genres_portion_kr_idx = genres_portion_kr.index
genres_portion_kr_v = genres_portion_kr.values
# 2. USA
idx_movies_us, genres_portion_us = _genre_share_by_nation("미국")
genres_portion_us_v = genres_portion_us.loc[genres_portion_kr_idx]
# 3. Japan
idx_movies_jp, genres_portion_jp = _genre_share_by_nation("일본")
genres_portion_jp_v = genres_portion_jp.loc[genres_portion_kr_idx]
# 4. Hong Kong
idx_movies_hk, genres_portion_hk = _genre_share_by_nation("홍콩")
genres_portion_hk_v = genres_portion_hk.loc[genres_portion_kr_idx]
# 5. France
idx_movies_fr, genres_portion_fr = _genre_share_by_nation("프랑스")
genres_portion_fr_v = genres_portion_fr.loc[genres_portion_kr_idx]
# One horizontal bar chart per country, all sharing the Korean genre ordering on the y axis.
fig, axs = plt.subplots(ncols=5, figsize=(10, 8), constrained_layout=True, sharey=True, sharex=True)

# Bar colours are applied positionally, i.e. in the order of genres_portion_kr_idx.
# NOTE(review): presumably hand-ordered to match that index - confirm if the data changes.
bar_colors = [
    c_western, c_musical, c_play, c_adv, c_war, c_sf, c_fantasy,
    c_family, c_ani, c_etc, c_horror, c_mistery, c_crime, c_ero,
    c_history, c_thriller, c_docu, c_comedy, c_action, c_romance, c_drama,
]
nation_names = ["한국", "미국", "일본", "홍콩", "프랑스"]
nation_colors = [c_kr, c_us, c_jp, c_hk, c_fr]
nation_portions = [
    genres_portion_kr_v,
    genres_portion_us_v,
    genres_portion_jp_v,
    genres_portion_hk_v,
    genres_portion_fr_v,
]
# Tick labels are the genre names with the "G_" prefix removed.
genre_ticklabels = [g.split("_")[1] for g in genres_portion_kr_idx]

for ax, n, c, p in zip(axs, nation_names, nation_colors, nation_portions):
    ax.barh(genres_portion_kr_idx, p, color=bar_colors, height=0.8, ec="k", lw=0.5)
    ax.spines[["top", "right", "bottom"]].set_visible(False)
    ax.grid(axis="y", alpha=0)
    ax.grid(axis="x", alpha=0.5)
    # Mark the 50 % level and shade the range below it.
    ax.axvline(0.5, c="orange", alpha=0.4, zorder=-1)
    ax.axvspan(0, 0.5, fc="yellow", alpha=0.1, zorder=-1)
    ax.tick_params(color="w")
    # Keep only the 0 / 0.5 / 1 ticks that fall inside the current x bounds.
    lower, upper = ax.get_xbound()
    ax.set_xticks([t for t in np.linspace(0, 1, 3) if lower <= t <= upper])
    ax.set_xticklabels([])
    ax.set_yticklabels(genre_ticklabels, size="small", fontweight="bold")
    ax.set_title(n, fontdict=font_title, pad=20)
    # Dotted separators every five genres.
    for separator_y in (5, 10, 15):
        ax.axhline(separator_y, c="lightgray", ls=":", lw=1, zorder=-1)

axs[0].set_ylim(-0.5, len(genres_portion_kr)-0.5)
# The legend handle is taken from the last axes by child position.
# NOTE(review): index 22 depends on the drawing order above - confirm which artist is meant.
fig.legend(handles=[ax.get_children()[22]], labels=["50%"])
fig.suptitle("국가별 개봉작 장르 비율 (%)\n", fontweight="bold")
fig.savefig("./images/genres_movie_ratio_nations.png", dpi=200)
# Trend per genre: one small panel for each of the first 20 genres.
fig, axes = plt.subplots(
    ncols=4, nrows=5, figsize=(16, 20),
    gridspec_kw={"hspace":0.1},
    sharex=True, constrained_layout=True,
)
axs = axes.ravel()
# Per-panel z-order of the Korean area; the foreign area gets the other of {1, 2}.
z_krs = [1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 2, 2, 2, 2]
genres_order = genres_noetc + ["기타"]
# Reference years drawn as vertical guides in every panel.
years = [1971, 1988, 1999, 2011, 2016]
# Yearly sum over all G_* columns, used as the denominator of each genre share.
total_kr = df_genresY_kr.filter(like="G_").sum(axis=1)
total_nkr = df_genresY_nkr.filter(like="G_").sum(axis=1)

for g, ax, c, z_kr in zip(genres_order[:20], axs, c_genres[:20], z_krs):
    genre_kr = df_genresY_kr[f"G_{g}"] / total_kr
    genre_nkr = df_genresY_nkr[f"G_{g}"] / total_nkr
    if z_kr == 1:
        z_nkr = 2
    else:
        z_nkr = 1
    # Korean films: black outline. Foreign films: outline in the genre colour.
    ax.fill_between(df_genresY_kr["openYear"], genre_kr * 100, 0,
                    fc=c, ec="k", alpha=0.5, label="한국 영화", zorder=z_kr)
    ax.fill_between(df_genresY_nkr["openYear"], genre_nkr * 100, 0,
                    fc=c, ec=c, alpha=0.5, label="해외 영화", zorder=z_nkr)
    ax.set_title(g, fontdict=font_title, fontsize="medium", pad=12)
    ax.set_xlim(1971, 2020)
    ax.set_ylim(0, )
    for year in years:
        ax.axvline(year, alpha=0.3, c=c_etc)
        # The first year is labelled to the right of its line, the others to the left.
        if year == 1971:
            text_x, ha = year + 1, "left"
        else:
            text_x, ha = year - 0.5, "right"
        ax.text(text_x, ax.get_ybound()[1], year,
                va="top", ha=ha, fontsize="small", c=c_etc, rotation=90, zorder=-1)
    ax.set_xticks(years)
    ax.set_xticklabels([])
    ax.grid(False)
    ax.tick_params(labelsize="small", pad=0)

### label
# A single figure-level legend built from the handles of one panel.
handles, labels = axs[15].get_legend_handles_labels()
fig.legend(handles=handles[:2], labels=labels[:2], fontsize="medium", ncol=2)
fig.suptitle("장르별 개봉작 수 점유율 (%)\n", fontweight="bold", color="k")
fig.savefig("./images/genres_separate.png", dpi=200)
# Genre tables split by country
def _split_genres_by_nation(nation):
    """Return (films, yearly) for the films whose ``N_<nation>`` flag equals 1.

    ``films`` is the matching slice of ``df_genres``. ``yearly`` has one row per
    ``openYear`` with the summed columns from position 2 onward, NaN replaced
    by 0 and ``openYear`` cast to int.
    """
    films = df_genres.loc[df_nations[f"N_{nation}"] == 1]
    yearly_counts = films[["movieCd", "openYear"]].groupby("openYear").count().reset_index()
    yearly_sums = films.iloc[:, 2:].groupby("openYear").sum().reset_index()
    # The per-year movieCd count only takes part in the merge; it is dropped right after.
    yearly = yearly_counts.merge(yearly_sums).drop("movieCd", axis=1).fillna(0)
    yearly["openYear"] = yearly["openYear"].astype(int)
    return films, yearly


# USA
df_genres_us, df_genresY_us = _split_genres_by_nation("미국")
# Japan
df_genres_jp, df_genresY_jp = _split_genres_by_nation("일본")
# Hong Kong
df_genres_hk, df_genresY_hk = _split_genres_by_nation("홍콩")
# France
df_genres_fr, df_genresY_fr = _split_genres_by_nation("프랑스")

# The five countries plotted together below.
N_top5 = ["한국", "미국", "일본", "홍콩", "프랑스"]
# Colour per genre column ("G_<genre>" -> colour).
G_colors = {f"G_{g}": color for g, color in zip(genres_order, c_genres)}
# Plotting helper
def plot_time_GN_(nation, times, ax, genres=None, topN=3):
    """Draw one country's yearly releases as a genre stack plus a total line.

    Parameters
    ----------
    nation : str
        One of "한국", "미국", "일본", "홍콩", "프랑스".
    times : sequence
        ``(start_year, end_year)``; both ends are included in the selection.
    ax : matplotlib Axes
        Axes to draw on.
    genres : sequence of str, optional
        ``G_*`` column names to stack. When omitted or empty, the ``topN``
        genres with the largest totals inside the period are used.
    topN : int
        Number of genres picked automatically when ``genres`` is not given.

    Returns
    -------
    (handles, labels)
        The result of ``ax.get_legend_handles_labels()``.

    Raises
    ------
    ValueError
        If ``nation`` is not one of the supported countries.
    """
    time_init = times[0]
    time_fin = times[1]

    if nation == "한국":
        df = df_genresY_kr
    elif nation == "미국":
        df = df_genresY_us
    elif nation == "일본":
        df = df_genresY_jp
    elif nation == "홍콩":
        df = df_genresY_hk
    elif nation == "프랑스":
        df = df_genresY_fr
    else:
        # Previously an unknown name fell through and failed later on an unbound `df`.
        raise ValueError(f"unsupported nation: {nation!r}")

    # Add an all-zero row for every year in 1971-2020 that has no releases. The
    # filler takes its columns from `df` itself, so it stays correct whatever the
    # number or order of columns.
    missing_years = sorted(set(range(1971, 2021)) - set(df["openYear"].values))
    if missing_years:
        df_zero = pd.DataFrame(0.0, index=range(len(missing_years)), columns=df.columns)
        df_zero["openYear"] = missing_years
        df = pd.concat([df, df_zero], axis=0)
    df = df.sort_values("openYear").reset_index(drop=True)

    # Restrict both tables to the requested period.
    period = f"{time_init} <= openYear <= {time_fin}"
    df_nationsY_time = df_nationsY.query(period)
    df_genresY_ntime = df.query(period)

    # Genre totals inside the period, largest first.
    G_sort = df_genresY_ntime.filter(like="G_").sum().sort_values(ascending=False)
    if genres is None or len(genres) == 0:
        G_sort_top = G_sort.iloc[:topN]
        G_names = list(G_sort_top.index)
        G_counts = list(G_sort_top.values)
    else:
        G_names = genres
        G_counts = G_sort.loc[genres].values

    # Total number of releases for this country.
    ax.plot(df_nationsY_time["openYear"], df_nationsY_time[f"N_{nation}"],
            c="k", lw=2, zorder=10, label="개봉 편 수")
    ax.stackplot(df_genresY_ntime["openYear"], [df_genresY_ntime[g] for g in G_names],
                 colors=[G_colors[g] for g in G_names], ec="w", lw=0.5, zorder=5,
                 labels=[f"{g.split('_')[1]}: {int(c)}" for g, c in zip(G_names, G_counts)])

    ax.set_xlim(times)
    # About four intervals across the period. The step is at least 1 so short
    # periods do not make range() fail, and ticks stop at time_fin so they do
    # not widen the x-limits set above.
    time_delta = max(1, int((time_fin - time_init) / 4))
    xticks = list(range(time_init, time_fin + 1, time_delta))
    ax.set_xticks(xticks)
    ax.set_ylim(0, )
    ax.grid(False)
    for x in xticks:
        ax.axvline(x, c="lightgray", ls=":")
    ax.spines[["top", "right"]].set_visible(False)
    ax.set_title(nation, fontdict=font_title)

    handles, labels = ax.get_legend_handles_labels()
    # The first entry (the total line) is left out of the per-axes legend.
    ax.legend(handles=handles[1:], labels=labels[1:], fontsize="small",
              loc="upper left", bbox_to_anchor=(1, 1), ncol=1)
    return handles, labels
def plot_time_GN(time_init, time_fin, topN=3):
    """Plot the genre stacks of the N_top5 countries for one period and save the figure.

    One row is drawn per country with ``plot_time_GN_``. The figure is written to
    ``./images/genres_nations_<time_init>-<time_fin>.png``.
    """
    period = (time_init, time_fin)
    fig, axs = plt.subplots(
        nrows=5,
        figsize=(8, 10),
        gridspec_kw={"hspace":0.1},
        constrained_layout=True,
        sharex=True,
    )
    for nation, panel in zip(N_top5, axs):
        plot_time_GN_(nation, period, panel, topN=topN)
    fig.align_ylabels(axs)
    fig.suptitle(f"주요 5개국 장르별 개봉작 수 (편, {time_init}-{time_fin})\n", fontweight="bold")
    fig.savefig(f"./images/genres_nations_{time_init}-{time_fin}.png", dpi=200)
# Period 1971-1987: top-3 genre stacks for the five countries.
plot_time_GN(1971, 1987)
# The bare expressions below are exploratory look-ups: each filters df_genres by
# opening year, then by a country flag taken from df_nations, then by a genre flag.
# Korean film titles (drama)
df_genres.query("1971 <= openYear <= 1987").loc[df_nations["N_한국"]==1].query("G_드라마 == 1")["movieNm"].values
# US film titles (action)
df_genres.query("1971 <= openYear <= 1987").loc[df_nations["N_미국"]==1].query("G_액션 == 1")["movieNm"].values
# US film titles (comedy)
df_genres.query("1971 <= openYear <= 1987").loc[df_nations["N_미국"]==1].query("G_코미디 == 1")["movieNm"].values
# Japanese films (all rows, no genre filter)
df_genres.query("1971 <= openYear <= 1987").loc[df_nations["N_일본"]==1]
# Period 1988-1998: top-4 genre stacks.
plot_time_GN(1988, 1998, topN=4)
# US film titles (drama)
df_genres.query("1988 <= openYear <= 1998").loc[df_nations["N_미국"]==1].query("G_드라마 == 1")["movieNm"].values
# US film titles (thriller)
df_genres.query("1988 <= openYear <= 1998").loc[df_nations["N_미국"]==1].query("G_스릴러 == 1")["movieNm"].values
# US film titles (animation)
df_genres.query("1988 <= openYear <= 1998").loc[df_nations["N_미국"]==1].query("G_애니메이션 == 1")["movieNm"].values
# Hong Kong film titles (action)
df_genres.query("1988 <= openYear <= 1998").loc[df_nations["N_홍콩"]==1].query("G_액션 == 1")["movieNm"].values
# Hong Kong film titles (crime)
df_genres.query("1988 <= openYear <= 1998").loc[df_nations["N_홍콩"]==1].query("G_범죄 == 1")["movieNm"].values
# Korean film titles (comedy)
df_genres.query("1988 <= openYear <= 1998").loc[df_nations["N_한국"]==1].query("G_코미디 == 1")["movieNm"].values
# Korean film titles (thriller)
df_genres.query("1988 <= openYear <= 1998").loc[df_nations["N_한국"]==1].query("G_스릴러 == 1")["movieNm"].values
# Korean film titles (horror)
df_genres.query("1988 <= openYear <= 1998").loc[df_nations["N_한국"]==1].query("G_공포 == 1")["movieNm"].values
# Films whose title contains "여고" (the "여고괴담" series); no year or country filter here
df_genres.loc[df_genres["movieNm"].str.contains("여고")]
# Korean film titles (drama)
df_genres.query("1988 <= openYear <= 1998").loc[df_nations["N_한국"]==1].query("G_드라마 == 1")["movieNm"].values
# Period 1999-2010: top-4 genre stacks.
plot_time_GN(1999, 2010, topN=4)
# Release counts of Korea, USA and Japan, with the 1999-2010 minimum and maximum highlighted.
from matplotlib.patches import FancyArrowPatch

fig, axs = plt.subplots(ncols=3, figsize=(10, 5), constrained_layout=True, sharex=True)
# Horizontal offset of the percentage label per country (0 for any other name).
label_shift_x = {"한국": -3, "미국": 5, "일본": -4}

for (ax, n, c) in zip(axs, ["한국", "미국", "일본"], [c_kr, c_us, c_jp]):
    df = df_nationsY.query("1988 <= openYear <= 2012")
    ax.stackplot(df["openYear"], df[f"N_{n}"], colors=[c], ec="k", lw=2, alpha=0.5)

    # Locate the lowest and highest yearly counts inside 1999-2010.
    window = df.query("1999 <= openYear <= 2010")
    counts = window[f"N_{n}"]
    min_x = window["openYear"].iloc[counts.argmin()]
    min_y = counts.min()
    max_x = window["openYear"].iloc[counts.argmax()]
    max_y = counts.max()
    ax.scatter(min_x, min_y, s=100, ec="r", lw=3, fc="w")
    ax.scatter(max_x, max_y, s=100, ec="b", lw=3, fc="w")

    # The arrow always points forward in time, from the earlier extreme to the later one.
    if min_x < max_x:
        arrow_start_x, arrow_end_x = min_x, max_x
        arrow_start_y, arrow_end_y = min_y, max_y
    else:
        arrow_start_x, arrow_end_x = max_x, min_x
        arrow_start_y, arrow_end_y = max_y, min_y
    arrow = FancyArrowPatch(
        (arrow_start_x, arrow_start_y),
        (arrow_end_x, arrow_end_y),
        shrinkA=10, shrinkB=10, mutation_scale=50, fc="gray", alpha=0.8,
    )
    ax.add_artist(arrow)

    # Relative change from the arrow's start to its end, written near the arrow midpoint.
    shift_x = label_shift_x.get(n, 0)
    change_pct = (arrow_end_y - arrow_start_y)/arrow_start_y*100
    ax.text((min_x + max_x)/2 + shift_x, (min_y + max_y)/2, f"{change_pct:.1f} %",
            fontsize="large", fontweight="bold", ha="center", va="bottom")

    # Green guides at both ends of the 1999-2010 window.
    for boundary_year in (1999, 2010):
        ax.axvline(boundary_year, lw=3, c="#00FF00", alpha=0.5)
    ax.set_xlim(1990, 2012)
    ax.set_title(f"{n} 영화 개봉 편 수", fontdict=font_title, pad=16)

fig.suptitle("1999-2010 국가별 영화 개봉 편수 최대 변동폭\n", fontweight="bold")
fig.savefig("./images/num_delta.png", dpi=200)
# Korean films, 1988-2010: adult titles stacked with thriller, horror, comedy and
# melodrama/romance. (Action is not part of this chart.)
fig, ax = plt.subplots(figsize=(10,5), constrained_layout=True)
# Select the period once and reuse it for every stacked layer.
kr_1988_2010 = df_genresY_kr.query("1988 <= openYear <= 2010")
stack_genres = ["성인물", "스릴러", "공포", "코미디", "멜로/로맨스"]
ax.stackplot(kr_1988_2010["openYear"],
             [kr_1988_2010[f"G_{g}"] for g in stack_genres],
             colors=[c_ero, c_thriller, c_horror, c_comedy, c_romance],
             labels=stack_genres)
ax.set_xlim(1988, 2010)
# The title now states the plotted range (data and x-limits end at 2010, not 2020).
ax.set_title("일부 장르 개봉 편 수 (1988-2010)", fontdict=font_title, pad=16)
# Hand-placed labels inside the stacked areas.
ax.text(1993, 5, "성인물", fontweight="bold", ha="center")
ax.text(2008, 5, "스릴러", fontweight="bold", ha="center")
ax.annotate("공포", (2007, 22), (2008, 30), c=c_horror, fontweight="bold", ha="center",
            arrowprops={"width":3, "fc":c_horror})
ax.text(2003, 20, "코미디", fontweight="bold", ha="center")
ax.text(1991.5, 50, "멜로/로맨스", fontweight="bold", ha="center")
xticks = [1988, 1990, 1995, 2000, 2005, 2010]
ax.set_xticks(xticks)
fig.savefig("./images/genre_change_1988.png", dpi=200)
# Share of adult titles among Korean releases per year, 1988-1998. Both Series are
# indexed by openYear before dividing so values are matched by year, not by row label.
adult_kr_1988_1998 = df_genresY_kr.query("1988 <= openYear <= 1998").set_index("openYear")["G_성인물"]
total_kr_1988_1998 = df_nationsY.query("1988 <= openYear <= 1998").set_index("openYear")["N_한국"]
adult_kr_1988_1998/total_kr_1988_1998
# Titles of the Korean adult films of the same period.
print(df_genres_kr.query("1988 <= openYear <= 1998").query("G_성인물==1")["movieNm"].values)
# Exploratory look-ups for 1999-2010: filter df_genres by opening year, then by a
# country flag from df_nations, then by a genre flag.
# Korean film titles (comedy)
df_genres.query("1999 <= openYear <= 2010").loc[df_nations["N_한국"]==1].query("G_코미디 == 1")["movieNm"].values
# Japanese film titles (comedy)
df_genres.query("1999 <= openYear <= 2010").loc[df_nations["N_일본"]==1].query("G_코미디 == 1")["movieNm"].values
# Korean film titles (action)
df_genres.query("1999 <= openYear <= 2010").loc[df_nations["N_한국"]==1].query("G_액션 == 1")["movieNm"].values
# Korean film titles (thriller)
df_genres.query("1999 <= openYear <= 2010").loc[df_nations["N_한국"]==1].query("G_스릴러 == 1")["movieNm"].values
# Korean film titles (melodrama/romance); a boolean mask is used here instead of query()
df_genres.query("1999 <= openYear <= 2010").loc[df_nations["N_한국"]==1].loc[df_genres["G_멜로/로맨스"]==1]["movieNm"].values
# French film titles (thriller)
df_genres.query("1999 <= openYear <= 2010").loc[df_nations["N_프랑스"]==1].query("G_스릴러 == 1")["movieNm"].values
# US film titles (thriller)
df_genres.query("1999 <= openYear <= 2010").loc[df_nations["N_미국"]==1].query("G_스릴러 == 1")["movieNm"].values
# Period 2011-2020: top-4 genre stacks.
plot_time_GN(2011, 2020, topN=4)
# Japanese film titles (adult), first 100
print(df_genres.query("2011 <= openYear <= 2020").loc[df_nations["N_일본"]==1].query("G_성인물 == 1")["movieNm"].values[:100])
# Korean film titles (adult)
print(df_genres.query("2011 <= openYear <= 2020").loc[df_nations["N_한국"]==1].query("G_성인물 == 1")["movieNm"].values)
# Korean film titles NOT flagged melodrama/romance (the mask keeps rows where the flag != 1), first 100
# NOTE(review): the original note read "melodrama/romance"; confirm whether != 1 is intended.
print(df_genres.query("2011 <= openYear <= 2020").loc[df_nations["N_한국"]==1].loc[df_genres["G_멜로/로맨스"] != 1]["movieNm"].values[:100])
# Presumed adult films hidden under other genres: Korean, not flagged adult, title contains "섹스"
df_genres.query("2011 <= openYear <= 2020").loc[df_nations["N_한국"]==1].loc[df_genres["G_성인물"] != 1].loc[df_genres["movieNm"].str.contains("섹스")]