In [39]:
import pandas as pd
import glob

# 1. Load the company reference data from the cq and rf sources
cq_company = pd.read_csv(r'../bishi/m_source/cq/company.csv')
rf_company = pd.read_csv(r'../bishi/m_source/rf/company.csv')

# 2. Filter for US-listed stocks (NASDAQ and NYSE)
# .copy() avoids SettingWithCopyWarning when these frames are modified later
cq_us_stocks = cq_company[cq_company['iq_company_ticker'].str.contains("NYSE|NASDAQ", na=False)].copy()
# Note: this unanchored pattern also matches other suffixes containing ".O"/".N"
# (e.g. ".NG"), visible as 3419.NG in the rf_price output below
rf_us_stocks = rf_company[rf_company['RIC'].str.contains(r"\.O|\.N", na=False)].copy()
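
If the goal is strictly NASDAQ/NYSE listings, an anchored pattern avoids picking up unrelated suffixes such as ".NG" (e.g. 3419.NG, a non-US listing). A minimal sketch of a stricter filter; the suffix list is an assumption based on the RICs shown in this notebook:

import pandas as pd

# Hypothetical example RICs; "3419.NG" should be excluded from a US-only universe
demo = pd.Series(["CPB.O", "CPB.OQ", "SII.N", "SOTR.O^K04", "3419.NG"])

# Anchored suffix match: .O / .OQ / .N, optionally followed by a "^..." delisting marker
us_mask = demo.str.contains(r"\.(?:O|OQ|N)(?:\^|$)", na=False)
print(demo[us_mask].tolist())  # ['CPB.O', 'CPB.OQ', 'SII.N', 'SOTR.O^K04']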
In [40]:
# cq_us_stocks
rf_us_stocks
Out[40]:
BusinessEntity DocumentTitle PermID PI RIC eq_type
0 QUOTExEQUITY Campbell Soup Co, Ordinary Share, NASDAQ Globa... 55839036104 1096879 CPB.O Ordinary Share
2 QUOTExEQUITY Campbell Soup Co, Ordinary Share, NASDAQ Stock... 55838320971 1089759 CPB.OQ Ordinary Share
14 QUOTExEQUITY Southtrust Corp Common Stock, Ordinary Share, ... 21475175232 733571 SOTR.O^K04 Ordinary Share
15 QUOTExEQUITY Southtrust Corp Common Stock, Ordinary Share, ... 21475175242 7878331 SOTR.OQ^K04 Ordinary Share
27 QUOTExEQUITY Sprott Inc, Ordinary Share, New York Stock Exc... 25727345541 444327127 SII.N Ordinary Share
... ... ... ... ... ... ...
4275 QUOTExEQUITY Sherwin-Williams Co, Ordinary Share, New York ... 55838325623 1091275 SHW.N Ordinary Share
4281 QUOTExEQUITY Resmed Inc, Ordinary Share, New York Stock Exc... 55838325403 1095013 RMD.N Ordinary Share
4287 QUOTExEQUITY Discover Financial Services, Ordinary Share, N... 55838321355 34287585 DFS.N Ordinary Share
4290 QUOTExEQUITY First Solar Inc, Ordinary Share, NASDAQ Global... 55839118368 29192693 FSLR.O Ordinary Share
4291 QUOTExEQUITY First Solar Inc, Ordinary Share, NASDAQ Stock ... 55835327909 29192694 FSLR.OQ Ordinary Share

834 rows × 6 columns

In [41]:
print(len(cq_company))
print(len(rf_company))
print("选择后")
print(len(cq_us_stocks))
print(len(rf_us_stocks))
915
4293
After filtering
656
834
In [42]:
# 3. Load the price data for cq and rf
import os

# Collect the price file paths
cq_price_files = glob.glob(r'../bishi/m_source/cq/price/*.csv')
rf_price_files = glob.glob(r'../bishi/m_source/rf/price/*.csv')

# Read all cq price files and concatenate them
cq_price_list = []
for file in cq_price_files:
    # Derive the ticker (stock identifier) from the file name: strip path and extension
    ticker = os.path.splitext(os.path.basename(file))[0]

    # Only load stocks that belong to the US universe selected above
    if ticker in cq_us_stocks['iq_cid'].values:
        df = pd.read_csv(file)
        df['ticker'] = ticker  # tag each row with the stock it belongs to
        df['source'] = 'cq'  # mark the data source
        cq_price_list.append(df)

# Warn if no matching files were found
if not cq_price_list:
    print("No relevant CQ price data found for US stocks.")
else:
    cq_price = pd.concat(cq_price_list, ignore_index=True)
    print(f'cq_price shape: {cq_price.shape}')

# Read all rf price files and concatenate them
rf_price_list = []
for file in rf_price_files:
    # Derive the RIC from the file name; note that str.strip('.csv') would be a bug
    # here (it strips a character set, not the suffix), so use os.path.splitext instead
    ric = os.path.splitext(os.path.basename(file))[0]
    # Only load stocks that belong to the US universe selected above
    if ric in rf_us_stocks['RIC'].values:
        df = pd.read_csv(file)
        df['RIC'] = ric  # tag each row with the stock it belongs to
        df['source'] = 'rf'  # mark the data source
        rf_price_list.append(df)

# Warn if no matching files were found
if not rf_price_list:
    print("No relevant RF price data found for US stocks.")
else:
    rf_price = pd.concat(rf_price_list, ignore_index=True)
    print(f'rf_price shape: {rf_price.shape}')
cq_price shape: (3999208, 9)
rf_price shape: (4246402, 9)
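
Checking ticker in cq_us_stocks['iq_cid'].values scans the whole array once per file; with hundreds of files, precomputing a Python set makes each lookup O(1). A minimal sketch of the same cq filter under that change (same file layout assumed):

import os

# Build the membership set once; 'in set' is O(1) versus an O(n) scan of .values
cq_universe = set(cq_us_stocks['iq_cid'])

cq_price_list = []
for f in cq_price_files:
    ticker = os.path.splitext(os.path.basename(f))[0]
    if ticker in cq_universe:
        cq_price_list.append(pd.read_csv(f).assign(ticker=ticker, source='cq'))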
In [43]:
cq_price
Out[43]:
date open high low close close_adj volume ticker source
0 2009-02-11 26.00 27.0700 25.900 26.43 22.73362 21.74218 IQ1002291 cq
1 2009-02-12 26.36 27.1900 26.000 27.16 23.36153 3.97082 IQ1002291 cq
2 2009-02-13 26.77 27.1900 26.670 26.81 23.06048 1.06470 IQ1002291 cq
3 2009-02-17 26.44 27.6300 26.300 27.26 23.44754 1.26460 IQ1002291 cq
4 2009-02-18 27.63 28.2200 27.200 27.95 24.04104 1.46571 IQ1002291 cq
... ... ... ... ... ... ... ... ... ...
3999203 2024-09-09 77.33 77.9900 76.795 77.89 77.89000 0.83088 IQ997111 cq
3999204 2024-09-10 78.14 78.8108 77.710 78.68 78.68000 0.55670 IQ997111 cq
3999205 2024-09-11 78.48 78.5700 76.630 78.46 78.46000 0.59314 IQ997111 cq
3999206 2024-09-12 78.40 79.4200 77.875 79.40 79.40000 0.98223 IQ997111 cq
3999207 2024-09-13 79.75 80.5900 79.420 79.97 79.97000 0.98616 IQ997111 cq

3999208 rows × 9 columns

In [44]:
rf_price
Out[44]:
date open high low close close_adj volume RIC source
0 2015-12-17 NaN NaN NaN NaN NaN NaN 3419.NG rf
1 2015-12-18 614.00 664.00 514.00 NaN NaN 393500.0 3419.NG rf
2 2015-12-21 506.00 516.00 444.00 NaN NaN 53100.0 3419.NG rf
3 2015-12-22 498.00 499.00 460.00 NaN NaN 19800.0 3419.NG rf
4 2015-12-24 477.00 541.00 472.00 NaN NaN 45200.0 3419.NG rf
... ... ... ... ... ... ... ... ... ...
4246397 2024-09-06 187.46 190.44 187.10 189.23 189.23 717886.0 ZTS.N rf
4246398 2024-09-09 190.55 193.57 190.23 191.73 191.73 696518.0 ZTS.N rf
4246399 2024-09-10 191.73 192.74 187.82 191.36 191.36 642446.0 ZTS.N rf
4246400 2024-09-11 190.71 190.71 186.60 189.99 189.99 538859.0 ZTS.N rf
4246401 2024-09-12 189.98 190.84 187.23 190.71 190.71 411595.0 ZTS.N rf

4246402 rows × 9 columns

In [45]:
# Print the shapes again to sanity-check the loaded files
print(f'cq_price shape: {cq_price.shape}')
print(f'rf_price shape: {rf_price.shape}')
cq_price shape: (3999208, 9)
rf_price shape: (4246402, 9)
In [46]:
cq_us_stocks['iq_company_name']
Out[46]:
0             Campbell Soup Company
1                 WestRock MWV, LLC
2                       El Paso LLC
4          Smith International Inc.
5             Cooper Industries plc
                   ...             
908                 Deere & Company
910      Iron Mountain Incorporated
911    The Sherwin-Williams Company
912                     ResMed Inc.
913     Discover Financial Services
Name: iq_company_name, Length: 656, dtype: object
In [48]:
# Extract the company name from DocumentTitle (the part before the first comma)
# Assign via .loc for an explicit, warning-free write
rf_us_stocks.loc[:, 'company_name'] = rf_us_stocks['DocumentTitle'].str.split(',').str[0]

rf_us_stocks['company_name']
Out[48]:
0                   Campbell Soup Co
2                   Campbell Soup Co
14      Southtrust Corp Common Stock
15      Southtrust Corp Common Stock
27                        Sprott Inc
                    ...             
4275             Sherwin-Williams Co
4281                      Resmed Inc
4287     Discover Financial Services
4290                 First Solar Inc
4291                 First Solar Inc
Name: company_name, Length: 834, dtype: object
In [49]:
# In Jupyter, pip install has to be run as a shell command with a leading "!" rather than as a plain Python statement:
!pip install rapidfuzz --proxy=http://127.0.0.1:10809
Defaulting to user installation because normal site-packages is not writeable
Requirement already satisfied: rapidfuzz in d:\software_tools\anaconda\envs\liangshu\lib\site-packages (3.9.7)
In [52]:
# 4. Link the two sources by matching on company name
# Normalize the names in both universes (trim whitespace, lowercase), then fuzzy-match them
import pandas as pd
from rapidfuzz import process, fuzz

# Re-derive company_name from DocumentTitle, now normalized for matching
rf_us_stocks.loc[:, 'company_name'] = rf_us_stocks['DocumentTitle'].str.split(',').str[0].str.strip().str.lower()

# Normalize the cq names the same way
cq_us_stocks.loc[:, 'iq_company_name'] = cq_us_stocks['iq_company_name'].str.strip().str.lower()


# Fuzzy matching
def fuzzy_merge(left, right, left_on, right_on, threshold=80, limit=1):
    """
    Perform a fuzzy merge between two dataframes.
    
    :param left: left dataframe
    :param right: right dataframe
    :param left_on: column name in the left dataframe
    :param right_on: column name in the right dataframe
    :param threshold: fuzz ratio threshold to consider a match
    :param limit: number of matches per row
    :return: merged dataframe with matched rows
    """
    # Fuzzy matching between two columns
    s = right[right_on].tolist()
    matches = []
    for item in left[left_on]:
        best_matches = process.extract(item, s, scorer=fuzz.token_sort_ratio, limit=limit)
        if best_matches and best_matches[0][1] >= threshold:
            matches.append(best_matches[0][0])
        else:
            matches.append(None)
    
    left.loc[:, 'best_match'] = matches
    return pd.merge(left, right, left_on='best_match', right_on=right_on, how='inner')

# Merge the two universes via fuzzy matching
merged_df = fuzzy_merge(rf_us_stocks, cq_us_stocks, 'company_name', 'iq_company_name')

# Inspect the matched names
print(merged_df[['DocumentTitle', 'iq_company_name', 'company_name']])
                                         DocumentTitle  \
0    Campbell Soup Co, Ordinary Share, NASDAQ Globa...   
1    Campbell Soup Co, Ordinary Share, NASDAQ Stock...   
2    CenterPoint Energy Inc, Ordinary Share, New Yo...   
3    Pentair PLC, Ordinary Share, New York Stock Ex...   
4    Kansas City Southern Ord Shs, Ordinary Share, ...   
..                                                 ...   
151  Under Armour Inc, Ordinary Share, Class A, New...   
152  Under Armour Inc, Ordinary Share, Class C, New...   
153  Sherwin-Williams Co, Ordinary Share, New York ...   
154  Resmed Inc, Ordinary Share, New York Stock Exc...   
155  Discover Financial Services, Ordinary Share, N...   

                  iq_company_name                  company_name  
0           campbell soup company              campbell soup co  
1           campbell soup company              campbell soup co  
2        centerpoint energy, inc.        centerpoint energy inc  
3                     pentair plc                   pentair plc  
4            kansas city southern  kansas city southern ord shs  
..                            ...                           ...  
151            under armour, inc.              under armour inc  
152            under armour, inc.              under armour inc  
153  the sherwin-williams company           sherwin-williams co  
154                   resmed inc.                    resmed inc  
155   discover financial services   discover financial services  

[156 rows x 3 columns]
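
Only 156 of the 834 rf quotes matched, largely because legal suffixes differ between the sources ("campbell soup co" vs. "campbell soup company"). Stripping common suffixes and punctuation before matching usually raises the hit rate; a minimal sketch (the suffix list is an assumption, not exhaustive):

import re

# Hypothetical normalizer: drop punctuation and common legal suffixes before matching
SUFFIXES = r'\b(?:company|co|corp|corporation|inc|plc|llc|ltd)\b'

def normalize_name(name):
    name = re.sub(r'[.,]', ' ', name.lower())   # drop punctuation
    name = re.sub(SUFFIXES, ' ', name)          # drop legal suffixes
    return re.sub(r'\s+', ' ', name).strip()    # collapse whitespace

print(normalize_name('Campbell Soup Company'))  # -> 'campbell soup'
print(normalize_name('Campbell Soup Co'))       # -> 'campbell soup'

The normalized names can then be passed to fuzzy_merge in place of the raw columns.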
In [53]:
# Count the missing values in each column
missing_counts = merged_df.isna().sum()
print(missing_counts)
BusinessEntity            0
DocumentTitle             0
PermID                    0
PI                        0
RIC                       0
eq_type                   0
company_name              0
best_match                0
iq_company_ticker         0
iq_cid                    0
iq_company_name           0
iq_native_company_name    0
iq_company_type           0
dtype: int64
In [54]:
display(merged_df)
BusinessEntity DocumentTitle PermID PI RIC eq_type company_name best_match iq_company_ticker iq_cid iq_company_name iq_native_company_name iq_company_type
0 QUOTExEQUITY Campbell Soup Co, Ordinary Share, NASDAQ Globa... 55839036104 1096879 CPB.O Ordinary Share campbell soup co campbell soup company NYSE:CPB IQ258458 campbell soup company Campbell Soup Company Public Company
1 QUOTExEQUITY Campbell Soup Co, Ordinary Share, NASDAQ Stock... 55838320971 1089759 CPB.OQ Ordinary Share campbell soup co campbell soup company NYSE:CPB IQ258458 campbell soup company Campbell Soup Company Public Company
2 QUOTExEQUITY CenterPoint Energy Inc, Ordinary Share, New Yo... 55838320929 9630878 CNP.N Ordinary Share centerpoint energy inc centerpoint energy, inc. NYSE:CNP IQ279513 centerpoint energy, inc. CenterPoint Energy, Inc. Public Company
3 QUOTExEQUITY Pentair PLC, Ordinary Share, New York Stock Ex... 55838324999 1091134 PNR.N Ordinary Share pentair plc pentair plc NYSE:PNR IQ295688 pentair plc Pentair plc Public Company
4 QUOTExEQUITY Kansas City Southern Ord Shs, Ordinary Share, ... 55838323699 1090524 KSU.N^L21 Ordinary Share kansas city southern ord shs kansas city southern NYSE:KSU IQ30472 kansas city southern Kansas City Southern Private Company
... ... ... ... ... ... ... ... ... ... ... ... ... ...
151 QUOTExEQUITY Under Armour Inc, Ordinary Share, Class A, New... 55838326336 23269251 UAA.N Ordinary Share under armour inc under armour, inc. NYSE:UAA IQ8740684 under armour, inc. Under Armour, Inc. Public Company
152 QUOTExEQUITY Under Armour Inc, Ordinary Share, Class C, New... 21584809535 222164648 UA.N Ordinary Share under armour inc under armour, inc. NYSE:UAA IQ8740684 under armour, inc. Under Armour, Inc. Public Company
153 QUOTExEQUITY Sherwin-Williams Co, Ordinary Share, New York ... 55838325623 1091275 SHW.N Ordinary Share sherwin-williams co the sherwin-williams company NYSE:SHW IQ303104 the sherwin-williams company The Sherwin-Williams Company Public Company
154 QUOTExEQUITY Resmed Inc, Ordinary Share, New York Stock Exc... 55838325403 1095013 RMD.N Ordinary Share resmed inc resmed inc. NYSE:RMD IQ340196 resmed inc. ResMed Inc. Public Company
155 QUOTExEQUITY Discover Financial Services, Ordinary Share, N... 55838321355 34287585 DFS.N Ordinary Share discover financial services discover financial services NYSE:DFS IQ3695566 discover financial services Discover Financial Services Public Company

156 rows × 13 columns

In [56]:
# For each fuzzy-matched company pair, align the two price series on date
# and cross-fill missing fields between the sources
merged_data_list = []

for _, row in merged_df.iterrows():
    cq_stock_ticker = row['iq_cid']
    rf_stock_ric = row['RIC']

    # Pull this stock's price history from each source
    cq_stock_data = cq_price[cq_price['ticker'] == cq_stock_ticker]
    rf_stock_data = rf_price[rf_price['RIC'] == rf_stock_ric]

    # Align the two sources on their shared dates (inner join); per-field gaps
    # on those dates are repaired from the other source below
    merged_stock_data = pd.merge(cq_stock_data, rf_stock_data, on='date', suffixes=('_cq', '_rf'))

    # Cross-check each field: prefer cq, fall back to rf where cq is missing
    merged_stock_data['close'] = merged_stock_data['close_cq'].combine_first(merged_stock_data['close_rf'])
    merged_stock_data['open'] = merged_stock_data['open_cq'].combine_first(merged_stock_data['open_rf'])
    merged_stock_data['high'] = merged_stock_data['high_cq'].combine_first(merged_stock_data['high_rf'])
    merged_stock_data['low'] = merged_stock_data['low_cq'].combine_first(merged_stock_data['low_rf'])
    merged_stock_data['volume'] = merged_stock_data['volume_cq'].combine_first(merged_stock_data['volume_rf'])

    # Adjustment factor: adj_factor = close_adj / close, defaulting to 1 where it cannot be computed
    merged_stock_data['adj_factor'] = merged_stock_data['close_adj_cq'].combine_first(merged_stock_data['close_adj_rf']) / merged_stock_data['close']
    merged_stock_data['adj_factor'] = merged_stock_data['adj_factor'].fillna(1)  # plain assignment avoids the chained inplace-fillna warning

    # Collect this stock's repaired frame
    merged_data_list.append(merged_stock_data)

# Concatenate all stocks into one frame
final_merged_data = pd.concat(merged_data_list, ignore_index=True)

# Keep only the repaired columns
final_merged_data = final_merged_data[['date', 'open', 'high', 'low', 'close', 'volume', 'adj_factor']]
print(final_merged_data.head())
         date  open      high       low     close  volume  adj_factor
0  1990-01-02   0.0  14.50000  13.50000  13.81250  2.8060    0.404277
1  1990-01-03   0.0  13.75000  13.31250  13.56250  1.9340    0.406115
2  1990-01-04   0.0  13.59375  13.06250  13.31250  1.4696    0.406115
3  1990-01-05   0.0  13.18750  12.81250  12.90625  1.9272    0.406115
4  1990-01-08   0.0  12.81250  12.46875  12.68750  1.5336    0.406115
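
Looping over merged_df and slicing the big price frames per stock works, but with roughly four million rows per source a single keyed merge is usually much faster. A minimal sketch of the same alignment done in one pass (assumes the merged_df, cq_price and rf_price built above):

# Attach the matched RIC to every cq price row via the (iq_cid -> RIC) pairs,
# then join the rf prices on (RIC, date) in one merge
pairs = merged_df[['iq_cid', 'RIC']].drop_duplicates()
cq_keyed = cq_price.merge(pairs, left_on='ticker', right_on='iq_cid')
aligned = cq_keyed.merge(rf_price, on=['RIC', 'date'], suffixes=('_cq', '_rf'))

# The per-field repairs then apply to the whole frame at once, e.g.:
aligned['close'] = aligned['close_cq'].combine_first(aligned['close_rf'])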
In [58]:
final_merged_data
Out[58]:
date open high low close volume adj_factor
0 1990-01-02 0.00 14.50000 13.50000 13.81250 2.80600 0.404277
1 1990-01-03 0.00 13.75000 13.31250 13.56250 1.93400 0.406115
2 1990-01-04 0.00 13.59375 13.06250 13.31250 1.46960 0.406115
3 1990-01-05 0.00 13.18750 12.81250 12.90625 1.92720 0.406115
4 1990-01-08 0.00 12.81250 12.46875 12.68750 1.53360 0.406115
... ... ... ... ... ... ... ...
907264 2024-09-06 132.45 135.05000 128.36000 128.68000 1.00826 1.000000
907265 2024-09-09 129.57 134.84000 129.57000 133.33000 1.08438 1.000000
907266 2024-09-10 132.66 132.66000 123.73500 130.34000 2.61948 1.000000
907267 2024-09-11 127.31 128.93000 124.47000 128.68000 1.33610 1.000000
907268 2024-09-12 128.68 129.53000 126.19000 129.26000 0.78801 1.000000

907269 rows × 7 columns

In [59]:
!pip install matplotlib  --proxy=http://127.0.0.1:10809
Defaulting to user installation because normal site-packages is not writeable
Requirement already satisfied: matplotlib in c:\users\misin\appdata\roaming\python\python38\site-packages (3.7.5)
Requirement already satisfied: contourpy>=1.0.1 in c:\users\misin\appdata\roaming\python\python38\site-packages (from matplotlib) (1.1.1)
Requirement already satisfied: cycler>=0.10 in c:\users\misin\appdata\roaming\python\python38\site-packages (from matplotlib) (0.12.1)
Requirement already satisfied: fonttools>=4.22.0 in c:\users\misin\appdata\roaming\python\python38\site-packages (from matplotlib) (4.54.1)
Requirement already satisfied: kiwisolver>=1.0.1 in c:\users\misin\appdata\roaming\python\python38\site-packages (from matplotlib) (1.4.7)
Requirement already satisfied: numpy<2,>=1.20 in c:\users\misin\appdata\roaming\python\python38\site-packages (from matplotlib) (1.24.4)
Requirement already satisfied: packaging>=20.0 in d:\software_tools\anaconda\envs\liangshu\lib\site-packages (from matplotlib) (24.1)
Requirement already satisfied: pillow>=6.2.0 in c:\users\misin\appdata\roaming\python\python38\site-packages (from matplotlib) (10.4.0)
Requirement already satisfied: pyparsing>=2.3.1 in c:\users\misin\appdata\roaming\python\python38\site-packages (from matplotlib) (3.1.4)
Requirement already satisfied: python-dateutil>=2.7 in d:\software_tools\anaconda\envs\liangshu\lib\site-packages (from matplotlib) (2.9.0)
Requirement already satisfied: importlib-resources>=3.2.0 in d:\software_tools\anaconda\envs\liangshu\lib\site-packages (from matplotlib) (6.4.5)
Requirement already satisfied: zipp>=3.1.0 in d:\software_tools\anaconda\envs\liangshu\lib\site-packages (from importlib-resources>=3.2.0->matplotlib) (3.20.2)
Requirement already satisfied: six>=1.5 in d:\software_tools\anaconda\envs\liangshu\lib\site-packages (from python-dateutil>=2.7->matplotlib) (1.16.0)
In [60]:
import seaborn as sns
import matplotlib.pyplot as plt

# Missing-value counts
missing_data = final_merged_data.isna().sum()
print("Missing value counts:\n", missing_data)

# Descriptive statistics
print("Descriptive statistics:\n", final_merged_data.describe())


# Use a CJK-capable font (SimHei); harmless when all labels are ASCII
plt.rcParams['font.sans-serif'] = ['SimHei']  # font that can render CJK text
plt.rcParams['axes.unicode_minus'] = False    # render minus signs correctly with this font


# Histogram of the adjustment factor
plt.figure(figsize=(12, 6))
sns.histplot(final_merged_data['adj_factor'], bins=30, kde=True)
plt.title("Adjustment factor distribution (adj_factor)")
plt.xlabel("adj_factor")
plt.ylabel("Frequency")
plt.show()

# Boxplot of the adjustment factor
sns.boxplot(x=final_merged_data['adj_factor'])
plt.title("Adjustment factor boxplot")
plt.xlabel("adj_factor")
plt.show()
Missing value counts:
 date           0
open          75
high          50
low           44
close         61
volume        47
adj_factor     0
dtype: int64
Descriptive statistics:
                 open           high            low          close  \
count  907194.000000  907219.000000  907225.000000  907208.000000   
mean       86.056477      87.507456      85.315611      86.433799   
std       257.188780     260.431421     253.781122     257.119068   
min         0.000000       0.000000       0.000000       0.049000   
25%        17.880000      18.400000      17.870000      18.150000   
50%        35.790000      36.321640      35.430000      35.906250   
75%        69.970000      70.930000      69.420000      70.190000   
max      9214.330000    9352.500000    9085.000000    9321.270000   

             volume     adj_factor  
count  9.072220e+05  907269.000000  
mean   3.513975e+04       1.294441  
std    3.716116e+05      39.541562  
min    0.000000e+00       0.000944  
25%    4.862000e-01       0.595626  
50%    1.243870e+00       0.817487  
75%    3.009680e+00       0.955260  
max    1.510355e+08    3456.000000  
[Figure: histogram with KDE of adj_factor]
[Figure: boxplot of adj_factor]
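
Given the heavy right tail reported by describe() (median adj_factor near 0.82 but a maximum around 3456), a linear-scale histogram compresses almost all of the mass into a single bin; a log-scale view is likely more informative. A minimal sketch:

import numpy as np

# Log-scale histogram of the adjustment factor (all values here are strictly positive)
plt.figure(figsize=(12, 6))
sns.histplot(np.log10(final_merged_data['adj_factor']), bins=50)
plt.title("log10(adj_factor) distribution")
plt.xlabel("log10(adj_factor)")
plt.ylabel("Frequency")
plt.show()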
In [61]:
!pip install scikit-learn --proxy=http://127.0.0.1:10809
Defaulting to user installation because normal site-packages is not writeable
Requirement already satisfied: scikit-learn in c:\users\misin\appdata\roaming\python\python38\site-packages (1.3.2)
Requirement already satisfied: numpy<2.0,>=1.17.3 in c:\users\misin\appdata\roaming\python\python38\site-packages (from scikit-learn) (1.24.4)
Requirement already satisfied: scipy>=1.5.0 in c:\users\misin\appdata\roaming\python\python38\site-packages (from scikit-learn) (1.10.1)
Requirement already satisfied: joblib>=1.1.1 in c:\users\misin\appdata\roaming\python\python38\site-packages (from scikit-learn) (1.4.2)
Requirement already satisfied: threadpoolctl>=2.0.0 in c:\users\misin\appdata\roaming\python\python38\site-packages (from scikit-learn) (3.5.0)
In [72]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error

# Assumes the objects built above are in scope:
# merged_df (matched company pairs with 'iq_company_name' and 'company_name'), plus cq_price and rf_price

# 1. Company-name match accuracy
def company_name_matching_accuracy(merged_df):
    # Count pairs whose normalized names are exactly equal
    matched_companies = merged_df[merged_df['iq_company_name'] == merged_df['company_name']].shape[0]
    total_companies = merged_df.shape[0]
    return matched_companies / total_companies

# 2. Date match rate
def date_matching_accuracy(cq_price, rf_price):
    # Find the dates common to both sources (caution: 'date' is non-unique, so this merge is near-cartesian)
    common_dates = pd.merge(cq_price[['date', 'ticker']], rf_price[['date', 'RIC']], on='date', how='inner')
    total_dates = len(cq_price['date'].unique())  # use the number of unique cq dates as the denominator
    return len(common_dates) / total_dates

# 3. Price repair consistency (mean squared error between sources)
def price_repair_consistency(cq_price, rf_price, date_col='date', price_col='close'):
    # Merge the two sources on date
    merged_prices = pd.merge(cq_price[[date_col, price_col]], rf_price[[date_col, price_col]], on=date_col, how='inner', suffixes=('_cq', '_rf'))
    # Compute the MSE
    mse = mean_squared_error(merged_prices[f'{price_col}_cq'], merged_prices[f'{price_col}_rf'])
    return mse

# 4. Adjustment-factor consistency
def adj_factor_consistency(cq_price, rf_price, date_col='date', adj_factor_col='adj_factor'):
    # Merge the two sources on date
    merged_factors = pd.merge(cq_price[[date_col, adj_factor_col]], rf_price[[date_col, adj_factor_col]], on=date_col, how='inner', suffixes=('_cq', '_rf'))
    # Mean absolute difference between the sources
    adj_factor_diff = np.abs(merged_factors[f'{adj_factor_col}_cq'] - merged_factors[f'{adj_factor_col}_rf'])
    return adj_factor_diff.mean()  # return the mean error

# 5. Overall confidence score
def calculate_overall_confidence(company_accuracy, date_accuracy, price_mse, adj_factor_error, company_weight=0.3, date_weight=0.2, price_weight=0.3, adj_factor_weight=0.2):
    # Convert MSE and mean error into consistency scores (this assumes both are scaled to [0, 1])
    price_consistency = 1 - price_mse  # lower MSE -> higher consistency
    adj_factor_consistency = 1 - adj_factor_error  # lower mean error -> higher consistency

    # Weighted overall confidence
    overall_confidence = (company_weight * company_accuracy + 
                          date_weight * date_accuracy + 
                          price_weight * price_consistency + 
                          adj_factor_weight * adj_factor_consistency)
    
    return overall_confidence

# merged_df is the fuzzy-matched company table produced above; note that the
# price frames carry 'close_adj' rather than a ready-made 'adj_factor' column,
# so adj_factor_consistency's default column would need to be built first

# Example: company-name match accuracy
company_accuracy = company_name_matching_accuracy(merged_df)
print(f'Company-name match accuracy: {company_accuracy * 100:.2f}%')
Company-name match accuracy: 10.90%
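
Exact string equality is a harsh yardstick here: these pairs were linked by fuzzy matching precisely because the names differ in punctuation and suffixes, so 10.90% understates match quality. The mean fuzzy similarity of the matched name pairs may be more representative; a minimal sketch:

from rapidfuzz import fuzz

# Mean token_sort_ratio across the matched name pairs (0-100 scale)
scores = [
    fuzz.token_sort_ratio(a, b)
    for a, b in zip(merged_df['company_name'], merged_df['iq_company_name'])
]
print(f'Mean fuzzy name similarity: {sum(scores) / len(scores):.1f} / 100')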
In [74]:
date_accuracy = date_matching_accuracy(cq_price, rf_price)
print(f'Date match rate: {date_accuracy * 100:.2f}%')
---------------------------------------------------------------------------
MemoryError                               Traceback (most recent call last)
Cell In[74], line 1
----> 1 date_accuracy = date_matching_accuracy(cq_price, rf_price)
      2 print(f'Date match rate: {date_accuracy * 100:.2f}%')

Cell In[72], line 18, in date_matching_accuracy(cq_price, rf_price)
     16 def date_matching_accuracy(cq_price, rf_price):
     17     # Find the dates common to both sources (caution: 'date' is non-unique, so this merge is near-cartesian)
---> 18     common_dates = pd.merge(cq_price[['date', 'ticker']], rf_price[['date', 'RIC']], on='date', how='inner')
     19     total_dates = len(cq_price['date'].unique())  # use the number of unique cq dates as the denominator
     20     return len(common_dates) / total_dates

File ~\AppData\Roaming\Python\Python38\site-packages\pandas\core\reshape\merge.py:162, in merge(left, right, how, on, left_on, right_on, left_index, right_index, sort, suffixes, copy, indicator, validate)
    131 @Substitution("\nleft : DataFrame or named Series")
    132 @Appender(_merge_doc, indents=0)
    133 def merge(
   (...)
    146     validate: str | None = None,
    147 ) -> DataFrame:
    148     op = _MergeOperation(
    149         left,
    150         right,
   (...)
    160         validate=validate,
    161     )
--> 162     return op.get_result(copy=copy)

File ~\AppData\Roaming\Python\Python38\site-packages\pandas\core\reshape\merge.py:809, in _MergeOperation.get_result(self, copy)
    806 if self.indicator:
    807     self.left, self.right = self._indicator_pre_merge(self.left, self.right)
--> 809 join_index, left_indexer, right_indexer = self._get_join_info()
    811 result = self._reindex_and_concat(
    812     join_index, left_indexer, right_indexer, copy=copy
    813 )
    814 result = result.__finalize__(self, method=self._merge_type)

File ~\AppData\Roaming\Python\Python38\site-packages\pandas\core\reshape\merge.py:1065, in _MergeOperation._get_join_info(self)
   1061     join_index, right_indexer, left_indexer = _left_join_on_index(
   1062         right_ax, left_ax, self.right_join_keys, sort=self.sort
   1063     )
   1064 else:
-> 1065     (left_indexer, right_indexer) = self._get_join_indexers()
   1067     if self.right_index:
   1068         if len(self.left) > 0:

File ~\AppData\Roaming\Python\Python38\site-packages\pandas\core\reshape\merge.py:1038, in _MergeOperation._get_join_indexers(self)
   1036 def _get_join_indexers(self) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]:
   1037     """return the join indexers"""
-> 1038     return get_join_indexers(
   1039         self.left_join_keys, self.right_join_keys, sort=self.sort, how=self.how
   1040     )

File ~\AppData\Roaming\Python\Python38\site-packages\pandas\core\reshape\merge.py:1690, in get_join_indexers(left_keys, right_keys, sort, how, **kwargs)
   1680 join_func = {
   1681     "inner": libjoin.inner_join,
   1682     "left": libjoin.left_outer_join,
   (...)
   1686     "outer": libjoin.full_outer_join,
   1687 }[how]
   1689 # error: Cannot call function of unknown type
-> 1690 return join_func(lkey, rkey, count, **kwargs)

File ~\AppData\Roaming\Python\Python38\site-packages\pandas\_libs\join.pyx:48, in pandas._libs.join.inner_join()

MemoryError: Unable to allocate 14.6 GiB for an array with shape (1953709328,) and data type int64
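
The allocation failure is expected: 'date' is not unique in either frame (one row per stock per day), so the inner join materializes the product of the per-date row counts, about 1.95 billion index entries here. For a date match rate, set arithmetic on the unique dates is sufficient and uses trivial memory; a minimal sketch (the function name is mine):

# Compare the sets of unique trading dates instead of joining row-level data
def date_matching_accuracy_safe(cq_price, rf_price):
    cq_dates = set(cq_price['date'].unique())
    rf_dates = set(rf_price['date'].unique())
    # share of cq trading dates that also appear in rf
    return len(cq_dates & rf_dates) / len(cq_dates)

date_accuracy = date_matching_accuracy_safe(cq_price, rf_price)
print(f'Date match rate: {date_accuracy * 100:.2f}%')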
In [79]:
# Example: price repair consistency
price_mse = price_repair_consistency(cq_price, rf_price)
print(f'Price repair MSE: {price_mse:.4f}')

# Example: adjustment-factor consistency
adj_factor_error = adj_factor_consistency(cq_price, rf_price)
print(f'Adjustment-factor mean error: {adj_factor_error:.4f}')


# Overall confidence score
confidence = calculate_overall_confidence(company_accuracy, date_accuracy, price_mse, adj_factor_error)
print(f'Overall confidence score: {confidence * 100:.2f}%')
---------------------------------------------------------------------------
MemoryError                               Traceback (most recent call last)
Cell In[79], line 2
      1 # Example: price repair consistency
----> 2 price_mse = price_repair_consistency(cq_price, rf_price)
      3 print(f'Price repair MSE: {price_mse:.4f}')
      5 # Example: adjustment-factor consistency

Cell In[72], line 25, in price_repair_consistency(cq_price, rf_price, date_col, price_col)
     23 def price_repair_consistency(cq_price, rf_price, date_col='date', price_col='close'):
     24     # Merge the two sources on date
---> 25     merged_prices = pd.merge(cq_price[[date_col, price_col]], rf_price[[date_col, price_col]], on=date_col, how='inner', suffixes=('_cq', '_rf'))
     26     # Compute the MSE
     27     mse = mean_squared_error(merged_prices[f'{price_col}_cq'], merged_prices[f'{price_col}_rf'])

File ~\AppData\Roaming\Python\Python38\site-packages\pandas\core\reshape\merge.py:162, in merge(left, right, how, on, left_on, right_on, left_index, right_index, sort, suffixes, copy, indicator, validate)
    131 @Substitution("\nleft : DataFrame or named Series")
    132 @Appender(_merge_doc, indents=0)
    133 def merge(
   (...)
    146     validate: str | None = None,
    147 ) -> DataFrame:
    148     op = _MergeOperation(
    149         left,
    150         right,
   (...)
    160         validate=validate,
    161     )
--> 162     return op.get_result(copy=copy)

File ~\AppData\Roaming\Python\Python38\site-packages\pandas\core\reshape\merge.py:809, in _MergeOperation.get_result(self, copy)
    806 if self.indicator:
    807     self.left, self.right = self._indicator_pre_merge(self.left, self.right)
--> 809 join_index, left_indexer, right_indexer = self._get_join_info()
    811 result = self._reindex_and_concat(
    812     join_index, left_indexer, right_indexer, copy=copy
    813 )
    814 result = result.__finalize__(self, method=self._merge_type)

File ~\AppData\Roaming\Python\Python38\site-packages\pandas\core\reshape\merge.py:1065, in _MergeOperation._get_join_info(self)
   1061     join_index, right_indexer, left_indexer = _left_join_on_index(
   1062         right_ax, left_ax, self.right_join_keys, sort=self.sort
   1063     )
   1064 else:
-> 1065     (left_indexer, right_indexer) = self._get_join_indexers()
   1067     if self.right_index:
   1068         if len(self.left) > 0:

File ~\AppData\Roaming\Python\Python38\site-packages\pandas\core\reshape\merge.py:1038, in _MergeOperation._get_join_indexers(self)
   1036 def _get_join_indexers(self) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]:
   1037     """return the join indexers"""
-> 1038     return get_join_indexers(
   1039         self.left_join_keys, self.right_join_keys, sort=self.sort, how=self.how
   1040     )

File ~\AppData\Roaming\Python\Python38\site-packages\pandas\core\reshape\merge.py:1690, in get_join_indexers(left_keys, right_keys, sort, how, **kwargs)
   1680 join_func = {
   1681     "inner": libjoin.inner_join,
   1682     "left": libjoin.left_outer_join,
   (...)
   1686     "outer": libjoin.full_outer_join,
   1687 }[how]
   1689 # error: Cannot call function of unknown type
-> 1690 return join_func(lkey, rkey, count, **kwargs)

File ~\AppData\Roaming\Python\Python38\site-packages\pandas\_libs\join.pyx:48, in pandas._libs.join.inner_join()

MemoryError: Unable to allocate 14.6 GiB for an array with shape (1953709328,) and data type int64
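
The same cartesian blow-up hits price_repair_consistency, which also joins the full frames on the non-unique 'date'. Comparing prices is only meaningful within a matched company pair anyway, where (stock, date) is unique. A minimal per-pair sketch (variable names are mine; close_adj could be compared the same way for the adjustment factor):

# Join each source's prices on (matched pair, date); keys are unique per stock-day
pairs = merged_df[['iq_cid', 'RIC']].drop_duplicates()
cq_keyed = cq_price.merge(pairs, left_on='ticker', right_on='iq_cid')
both = cq_keyed.merge(rf_price, on=['RIC', 'date'], suffixes=('_cq', '_rf'))

# MSE over the rows where both sources report a close price
valid = both[['close_cq', 'close_rf']].dropna()
price_mse = ((valid['close_cq'] - valid['close_rf']) ** 2).mean()
print(f'Price repair MSE (matched pairs only): {price_mse:.4f}')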