モザイクプロット

モザイクプロット#

準備#

Import#

変数#

関数#

Show code cell content Hide code cell content

def create_mosaicplot(
    df: pd.DataFrame,
    x: str,
    y: str,
    color: str,
    width: str,
    text: str,
    color_discrete_sequence: List[str] = OKABE_ITO,
) -> go.Figure:
    """
    Build a mosaic plot (variable-width stacked bar chart) from a DataFrame.

    One ``go.Bar`` trace is added per unique value of the ``color`` column.
    Within each trace the bars are laid out left to right, each bar starting
    where the previous one ended, so that the bar widths come from the
    ``width`` column. Traces are stacked on top of each other.

    The x-axis tick positions, tick labels and visible range are derived from
    the rows of the *first* group only.
    NOTE(review): this presumably expects every group to contain the same
    x categories, in the same row order and with the same widths -- confirm
    against the callers' data preparation.

    Parameters
    ----------
    df : pd.DataFrame
        DataFrame holding the data to plot.
    x : str
        Name of the column whose values label the x-axis ticks.
    y : str
        Name of the column giving the bar heights.
    color : str
        Name of the column used to split the data into traces (groups).
    width : str
        Name of the column giving the width of each bar.
    text : str
        Name of the column holding the text shown on each bar.
    color_discrete_sequence : List[str], optional
        Colors assigned to the groups in order of first appearance.
        Defaults to the OKABE_ITO palette. When there are more groups than
        colors, the palette is reused from its beginning.

    Returns
    -------
    go.Figure
        The mosaic plot. If ``df`` has no rows, the figure has no traces and
        only the axis titles, bar mode and legend title are set.

    Raises
    ------
    ValueError
        If ``color_discrete_sequence`` is empty.
    """
    palette = list(color_discrete_sequence)
    if not palette:
        raise ValueError("color_discrete_sequence must contain at least one color")

    # Start from an empty figure and add one trace per group.
    fig = go.Figure()

    # Map each unique value of the color column to a palette entry. Indexing
    # modulo the palette length (rather than zip) keeps every group mapped
    # even when there are more groups than colors.
    unique_keys = df[color].unique()
    color_map = {
        name: palette[idx % len(palette)] for idx, name in enumerate(unique_keys)
    }

    # The axis title is always set; ticks and range are added only once a
    # first group exists, so an empty DataFrame does not fail.
    xaxis_settings = {"title": x}

    for i, name in enumerate(unique_keys):
        # Rows belonging to this group, re-indexed from 0.
        df_tmp = df[df[color] == name].reset_index(drop=True)
        widths = df_tmp[width]
        # Right edge of each bar; subtracting the width gives its left edge.
        right_edges = widths.cumsum()

        # offset=0 makes each bar start exactly at its x value instead of
        # being centered on it, so consecutive bars are laid edge to edge.
        fig.add_trace(
            go.Bar(
                name=name,
                x=right_edges - widths,
                y=df_tmp[y],
                text=df_tmp[text],
                width=widths,
                offset=0,
                marker_color=color_map[name],
            )
        )

        # Use the first group to define the x-axis labelling.
        if i == 0:
            # Total width of all bars, used to size the visible range.
            x_max = widths.sum()
            xaxis_settings.update(
                # Place each tick at the horizontal center of its bar.
                tickvals=right_edges - widths / 2,
                ticktext=df_tmp[x].unique(),
                # Leave a margin of 10% of the total width on each side.
                range=[-x_max * 0.1, x_max * 1.1],
            )

    # Apply tick positions, tick labels, title and visible range to the x-axis.
    fig.update_xaxes(**xaxis_settings)

    # Set the y-axis title.
    fig.update_yaxes(title=y)

    # Stack the traces and title the legend after the grouping column.
    fig.update_layout(barmode="stack", legend_title=color)

    return fig

可視化例#

マンガデータ#

Show code cell content Hide code cell content

# Inspect the DataFrame to be visualized (manga data)
df_cm.head()

	マンガ雑誌名	年代	マンガ作者数	years_total	マンガ作者数のシェア	text
0	週刊少年サンデー	1970	197	866	0.227483	0.23
1	週刊少年サンデー	1980	208	886	0.234763	0.23
2	週刊少年サンデー	1990	182	768	0.236979	0.24
3	週刊少年サンデー	2000	181	879	0.205916	0.21
4	週刊少年サンデー	2010	191	946	0.201903	0.2

Show code cell content Hide code cell content

# Extract the rows for the magazine 週刊少年チャンピオン (Weekly Shonen Champion)
df_cm[df_cm["マンガ雑誌名"] == "週刊少年チャンピオン"]

	マンガ雑誌名	年代	マンガ作者数	years_total	マンガ作者数のシェア	text
10	週刊少年チャンピオン	1970	182	866	0.210162	0.21
11	週刊少年チャンピオン	1980	239	886	0.269752	0.27
12	週刊少年チャンピオン	1990	208	768	0.270833	0.27
13	週刊少年チャンピオン	2000	265	879	0.301479	0.3
14	週刊少年チャンピオン	2010	256	946	0.270613	0.27

アニメデータ#

Show code cell content Hide code cell content

# Inspect the DataFrame to be visualized (anime data)
df_an.head()

	性別	年代	声優数	合計声優数	声優数のシェア	text
0	female	2000	561	1113	0.504043	0.50
1	female	2005	844	1695	0.497935	0.50
2	female	2010	856	1654	0.517533	0.52
3	female	2015	597	1149	0.519582	0.52
4	male	2000	552	1113	0.495957	0.50

Show code cell content Hide code cell content

# Inspect the second anime DataFrame to be visualized
df_an2.head()

	性別	年代	声優数	合計声優数	声優数のシェア	text
0	female	2000	127	252	0.503968	0.50
1	female	2001	145	287	0.505226	0.51
2	female	2002	147	288	0.510417	0.51
3	female	2003	173	342	0.505848	0.51
4	female	2004	202	407	0.496314	0.50

ゲームデータ#

Show code cell content Hide code cell content

# Inspect the DataFrame to be visualized (game data)
df_gm.head()

	発売年代	weekday	パッケージ数	発売曜日	合計パッケージ数	パッケージ数のシェア	text
0	1990	0	34	月	2074	0.016393	0.016
1	1990	1	81	火	2074	0.039055	0.039
2	1990	2	66	水	2074	0.031823	0.032
3	1990	3	122	木	2074	0.058824	0.059
4	1990	4	1559	金	2074	0.751688	0.75

モザイクプロット

Contents

モザイクプロット#

準備#

Import#

変数#

関数#

可視化例#

マンガデータ#

アニメデータ#

ゲームデータ#