# Field Analysis

> The latest statistics are from 2021, either because Scimago stopped collecting data or because newer citation data is deemed unreliable.

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Per-journal yearly records; the merge in the next cell relies on its
# `year` and `journal__sourceid` columns.
yearly_df = pd.read_csv("journal_record.csv")

In [3]:
# Journal master data joined (inner join) with each journal's 2021 yearly record.
df = pd.read_csv("journal.csv").merge(
    yearly_df[yearly_df["year"] == 2021],
    how="inner",
    left_on="sourceid",
    right_on="journal__sourceid",
)

In [4]:
# Presumably the journal-to-area mapping (inferred from the file name).
# NOTE(review): `area_df` is not referenced by any cell visible in this
# section of the notebook — confirm it is still needed.
area_df = pd.read_csv("journal_area.csv")

In [5]:
def get_cats(df):
    """Explode the "; "-separated `categories` column into one row per category.

    Each category entry is expected to look like ``"<field> (Q<n>)"``. Entries
    without a trailing quartile marker (for example
    ``"Computer Science (miscellaneous)"``) keep their full text as the field
    and get a missing ``Q``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a string column ``categories`` and a ``sourceid`` column.
        Rows whose ``categories`` value is missing are skipped.

    Returns
    -------
    pandas.DataFrame
        Columns ``field``, ``Q`` and ``sourceid`` with a fresh 0..n-1 index,
        one row per (journal, category) pair in input order.
    """
    id_col = "sourceid"
    exploded = (
        df.loc[:, [id_col, "categories"]]
        .dropna(subset=["categories"])
        .assign(catbase=lambda _df: _df["categories"].str.split("; "))
        # explode is linear, unlike summing the per-row lists together
        .explode("catbase")
        .reset_index(drop=True)
    )
    # Only a trailing "(Q1)".."(Q4)" counts as a quartile; anchoring the
    # pattern keeps parentheses that belong to the field name inside the field.
    parsed = exploded["catbase"].str.extract(
        r"^(?P<field>.*) \((?P<Q>Q[1-4])\)\s*$"
    )
    return pd.DataFrame(
        {
            # fall back to the raw entry when no quartile marker was found
            "field": parsed["field"].fillna(exploded["catbase"]),
            "Q": parsed["Q"],
            id_col: exploded[id_col],
        }
    )

In [6]:
# One row per (journal, category), enriched with the journal-level columns of `df`.
# `merge` is called without `on`, so the join uses every column name the two
# frames share (at least `sourceid`).
# NOTE(review): only the first 10,000 rows of `df` are categorised — confirm
# this cap is intentional and not a leftover from experimentation.
cat_base = get_cats(df.head(10000)).merge(df)

In [7]:
# Journal counts per field (rows) and quartile label (columns). Missing values
# are replaced by "no Q" first so they get their own column and still count
# towards the row total that drives the ordering.
field_pivot = (
    cat_base.fillna("no Q")
    .pivot_table(values="sourceid", index="field", columns="Q", aggfunc="count")
    .fillna(0)
    .assign(s=lambda counts: counts.sum(axis=1))
    .sort_values(by="s", ascending=False)
    # keep only columns whose label starts with "Q" (drops "no Q" and the helper total "s")
    .filter(regex="^Q")
)
(
    field_pivot.head(15)
    .style.background_gradient(axis=1)
    .set_caption("Count values by fields")
)

Q,Q1,Q2,Q3,Q4
field,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Medicine (miscellaneous),389.0,426.0,374.0,307.0
Sociology and Political Science,197.0,97.0,58.0,50.0
Mechanical Engineering,114.0,87.0,64.0,68.0
"Geography, Planning and Development",98.0,86.0,75.0,74.0
Electrical and Electronic Engineering,96.0,71.0,60.0,66.0
History,111.0,77.0,63.0,63.0
"Ecology, Evolution, Behavior and Systematics",99.0,92.0,82.0,39.0
Economics and Econometrics,127.0,87.0,48.0,35.0
Condensed Matter Physics,75.0,79.0,73.0,57.0
Education,152.0,66.0,31.0,25.0


In [8]:
def draw_table(df):
    """Render per-(field, Q) statistics as a styled field-by-quartile table.

    The input is pivoted so that fields become rows and ``Q`` values become
    the innermost column level. Rows are restricted to the first 15 fields of
    the notebook-level ``field_pivot``, columns that are missing for at least
    half of those fields are dropped, and a per-column background gradient
    is applied.

    Returns a pandas ``Styler``.
    """
    top_fields = field_pivot.head(15).index
    wide = df.pivot_table(index="field", columns="Q").loc[top_fields]
    # share of missing cells per column, evaluated on the selected rows only
    mostly_present = wide.isna().mean() < 0.5
    return wide.loc[:, mostly_present].style.background_gradient(axis=0)

In [9]:
# Average h-index, journal rating and 3-year document count per field and quartile.
draw_table(
    cat_base.groupby(["field", "Q"])[
        ["h_index", "journal_rating", "total_docs_3years"]
    ].mean()
).set_caption("Mean values of fields")

Unnamed: 0_level_0,h_index,h_index,h_index,h_index,journal_rating,journal_rating,journal_rating,journal_rating,total_docs_3years,total_docs_3years,total_docs_3years,total_docs_3years
Q,Q1,Q2,Q3,Q4,Q1,Q2,Q3,Q4,Q1,Q2,Q3,Q4
field,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
Medicine (miscellaneous),138.403599,80.525822,49.620321,21.368078,1.952355,0.664251,0.375511,0.157072,850.737789,573.298122,381.534759,283.85342
Sociology and Political Science,84.091371,38.340206,20.431034,11.32,1.497914,0.387918,0.193,0.11742,203.984772,143.391753,106.0,75.18
Mechanical Engineering,128.877193,60.908046,29.71875,17.382353,1.562754,0.529828,0.290672,0.141176,1171.219298,559.816092,387.484375,410.220588
"Geography, Planning and Development",76.653061,38.674419,22.093333,12.635135,1.229337,0.454767,0.242347,0.128973,281.77551,120.046512,87.6,63.054054
Electrical and Electronic Engineering,144.25,78.830986,36.816667,19.0,1.906833,0.556042,0.295233,0.145561,1155.65625,1001.690141,454.65,320.575758
History,35.720721,16.350649,10.095238,7.126984,0.477171,0.159545,0.114698,0.101079,112.414414,88.675325,69.015873,46.412698
"Ecology, Evolution, Behavior and Systematics",124.878788,63.684783,40.231707,23.871795,1.492626,0.575076,0.357134,0.184308,480.070707,210.891304,152.597561,76.692308
Economics and Econometrics,113.03937,53.011494,29.229167,12.857143,3.441748,0.702667,0.331229,0.157029,327.795276,207.195402,130.604167,140.171429
Condensed Matter Physics,147.466667,93.075949,55.657534,27.54386,1.74784,0.582481,0.351151,0.177912,1605.626667,1041.101266,535.136986,514.263158
Education,79.598684,48.287879,28.064516,19.04,1.390322,0.531803,0.292581,0.14724,246.184211,210.606061,151.548387,81.28


In [10]:
def gini(s):
    """Return the Gini coefficient of the value shares in `s`.

    The shares are the relative frequencies of the distinct values of `s`.
    The coefficient is the sum of absolute differences over all ordered pairs
    of shares, divided by ``2 * k**2 * mean_share`` where ``k`` is the number
    of distinct values.
    """
    shares = s.value_counts(normalize=True)
    p = shares.values
    n_distinct = shares.shape[0]
    # k x k matrix of |p_i - p_j| built by broadcasting a column against a row
    pairwise_gap = np.abs(p[:, None] - p[None, :])
    return pairwise_gap.sum() / (2 * n_distinct ** 2 * shares.mean())

def top5(s):
    """Return the combined relative frequency of the five most common values in `s`."""
    shares = s.value_counts(normalize=True)
    return shares.iloc[:5].sum()

## Concentration metrics by fields

In [11]:
# Country and publisher concentration (Gini and top-5 share) per field and quartile.
draw_table(
    cat_base.groupby(["field", "Q"])[["country", "publisher"]].agg([gini, top5])
).set_caption("Concentration metrics by fields")

Unnamed: 0_level_0,country,country,country,country,country,country,country,country,publisher,publisher,publisher,publisher,publisher,publisher,publisher,publisher
Unnamed: 0_level_1,gini,gini,gini,gini,top5,top5,top5,top5,gini,gini,gini,gini,top5,top5,top5,top5
Q,Q1,Q2,Q3,Q4,Q1,Q2,Q3,Q4,Q1,Q2,Q3,Q4,Q1,Q2,Q3,Q4
field,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3
Medicine (miscellaneous),0.784429,0.771248,0.722648,0.63413,0.899743,0.821596,0.681818,0.472313,0.542272,0.549157,0.409589,0.192732,0.246787,0.215962,0.227273,0.114007
Sociology and Political Science,0.690355,0.678538,0.551724,0.458,0.989848,0.907216,0.689655,0.6,0.593788,0.432317,0.29443,0.108182,0.482234,0.412371,0.37931,0.22
Mechanical Engineering,0.558897,0.647783,0.595703,0.562092,0.973684,0.873563,0.796875,0.75,0.543344,0.413282,0.198342,0.079696,0.535088,0.425287,0.234375,0.147059
"Geography, Planning and Development",0.64898,0.729875,0.589444,0.536383,1.0,0.895349,0.706667,0.635135,0.554731,0.46852,0.294422,0.088745,0.642857,0.534884,0.306667,0.162162
Electrical and Electronic Engineering,0.489583,0.600939,0.557143,0.586453,1.0,0.943662,0.816667,0.772727,0.709491,0.425822,0.338542,0.069548,0.833333,0.464789,0.383333,0.151515
History,0.674389,0.614719,0.502924,0.507937,0.981982,0.805195,0.666667,0.730159,0.530346,0.347165,0.199546,0.112554,0.531532,0.324675,0.285714,0.174603
"Ecology, Evolution, Behavior and Systematics",0.66483,0.631884,0.560976,0.495192,0.939394,0.858696,0.682927,0.717949,0.528313,0.3811,0.232707,0.024966,0.545455,0.336957,0.256098,0.153846
Economics and Econometrics,0.301181,0.535304,0.607639,0.386555,1.0,0.977011,0.854167,0.6,0.526772,0.466749,0.314655,0.05368,0.543307,0.54023,0.416667,0.2
Condensed Matter Physics,0.586667,0.526899,0.610212,0.491228,0.933333,0.962025,0.90411,0.877193,0.482581,0.451008,0.41677,0.202951,0.546667,0.481013,0.452055,0.280702
Education,0.515789,0.647727,0.562212,0.445,1.0,0.954545,0.935484,0.88,0.603659,0.409091,0.256598,0.148571,0.559211,0.454545,0.451613,0.36
