统计数据下载情况

  数据下载好后需要统计下载的情况,这是python脚本。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
import os
from obspy import read
from datetime import datetime as dt
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import matplotlib.dates as mdates
from collections import defaultdict
from matplotlib.transforms import blended_transform_factory

# ===== 配置 =====
data_dir = "daily_waveforms" # 存放YYYYMMDD.mseed文件目录
station_file = "station.list" # 台站列表,格式:NET STA(两列)
components = ["LHZ", "LHN", "LHE"]
start_year = 2020
end_year = 2020

# ===== 读取台站列表 =====
stations = []
with open(station_file, "r") as f:
for line in f:
if line.strip():
net, sta = line.strip().split()[:2]
stations.append((net, sta))

station_set = set(f"{net}.{sta}" for net, sta in stations)

# ===== 初始化 =====
station_dates = {comp: defaultdict(set) for comp in components} # comp -> {net.sta -> set(date)}
station_file_sizes = defaultdict(int) # 台站对应所有文件大小
total_files = 0
total_file_size = 0

# ===== 遍历所有mseed文件 =====
all_files = sorted(f for f in os.listdir(data_dir) if f.endswith(".mseed"))
for filename in all_files:
filepath = os.path.join(data_dir, filename)

size = os.path.getsize(filepath)
total_files += 1
total_file_size += size

try:
st = read(filepath)
except Exception as e:
print(f"读取文件失败 {filename}: {e}")
continue

# 解析文件日期 YYYYMMDD
try:
file_date = dt.strptime(filename[:8], "%Y%m%d").date()
except Exception as e:
print(f"无法解析日期 {filename}: {e}")
continue

for tr in st:
net = tr.stats.network
sta = tr.stats.station
comp = tr.stats.channel[-3:]

key = f"{net}.{sta}"
if key not in station_set:
continue
if comp not in components:
continue

station_dates[comp][key].add(file_date)
station_file_sizes[key] += size

print(f"共计读取文件数: {total_files}")
print(f"所有文件总大小: {total_file_size / (1024**2):.2f} MB")

# ===== 删除无有效数据台站 =====
for comp in components:
keys_to_remove = [k for k, dates in station_dates[comp].items() if len(dates) == 0]
for k in keys_to_remove:
del station_dates[comp][k]

# ===== 输出有效天数统计文本 =====
for comp in components:
with open(f"station_day_count_{comp}.txt", "w") as f:
for key in sorted(station_dates[comp].keys()):
f.write(f"{key} {len(station_dates[comp][key])}\n")

# ===== 台网颜色映射 =====
all_nets = sorted(set(net for net, sta in stations))
net_color_map = {net: cm.get_cmap("tab20")(i / max(len(all_nets)-1,1)) for i, net in enumerate(all_nets)}

# ===== 绘图函数 =====
def plot_component_availability(comp, station_dates_comp):
valid_stations = sorted(station_dates_comp.keys(), key=lambda x: (x.split(".")[0], x.split(".")[1]))
fig, ax = plt.subplots(figsize=(14, max(6, 0.3 * len(valid_stations))))

def to_datetime(d):
return dt(d.year, d.month, d.day)

for idx, key in enumerate(valid_stations):
net = key.split(".")[0]
dates = sorted(station_dates_comp[key])
x_vals = [to_datetime(d) for d in dates]
ax.plot(x_vals, [idx]*len(dates), ".", color=net_color_map[net], markersize=2)

ax.set_xlabel("时间")
ax.set_ylabel("台站")
ax.set_yticks(range(len(valid_stations)))
ax.set_yticklabels(valid_stations, fontsize=5)

ax.grid(True, linestyle=":", alpha=0.5)

x_start = dt(start_year, 1, 1)
x_end = dt(end_year, 12, 31)
ax.set_xlim(x_start, x_end)

ax.xaxis.set_major_locator(mdates.YearLocator())
ax.xaxis.set_major_formatter(mdates.DateFormatter("%Y"))

plt.tight_layout(rect=[0, 0, 0.85, 1])

# 右侧外部标注台网名
trans = blended_transform_factory(ax.transAxes, ax.transData)
net_indices = defaultdict(list)
for idx, key in enumerate(valid_stations):
net = key.split(".")[0]
net_indices[net].append(idx)

for net, indices in net_indices.items():
mid_idx = indices[len(indices)//2]
color = net_color_map[net]
ax.text(1.02, mid_idx, net, color=color, fontsize=8, fontweight='bold',
verticalalignment='center', horizontalalignment='left',
transform=trans)

plt.savefig(f"{comp}_availability_2013_2022.png", dpi=300, bbox_inches='tight')
plt.close()
print(f"已保存图像: {comp}_availability_2013_2022.png")

# ===== 画三分量图 =====
for comp in components:
plot_component_availability(comp, station_dates[comp])

# ===== 台网统计 =====
network_stats = defaultdict(lambda: {"total": 0, "valid": 0, "day_counts": []})

for net, sta in stations:
network_stats[net]["total"] += 1

all_valid_keys = set()
for comp in components:
all_valid_keys.update(station_dates[comp].keys())

for key in all_valid_keys:
net = key.split(".")[0]
days_list = [len(station_dates[comp].get(key, [])) for comp in components]
valid_days = max(days_list)
network_stats[net]["valid"] += 1
network_stats[net]["day_counts"].append(valid_days)

print("\n台网统计汇总:")
print(f"{'Net':<6} {'Total':>6} {'Valid':>6} {'AvgDays':>10}")
print("-" * 32)
with open("network_summary_all_components.txt", "w") as f:
f.write(f"{'Net':<6} {'Total':>6} {'Valid':>6} {'AvgDays':>10}\n")
for net in sorted(network_stats.keys()):
total = network_stats[net]["total"]
valid = network_stats[net]["valid"]
avg_days = np.mean(network_stats[net]["day_counts"]) if network_stats[net]["day_counts"] else 0
line = f"{net:<6} {total:>6} {valid:>6} {avg_days:>10.1f}"
print(line)
f.write(line + "\n")

# ===== 总结 =====
print(f"\n文件总数: {total_files}")
print(f"文件总大小: {total_file_size/(1024**2):.2f} MB ({total_file_size/(1024**3):.2f} GB)")

  这个脚本实现的功能包括:

  • 统计开始时间start_year到结束时间end_year的数据下载情况
  • 对LHZ、LHN和LHE分别进行统计。
  • 对统计结果进行可视化输出。
  • 横轴是时间,纵轴是台站名。不同台网用不同颜色标出。在图右侧标出台网名。
  • 输出文件network_summary_all_components.txt,给出台网包含台站数目,有效台站数目和各自平均天数。
  • 输出文件station_day_count_{comp}.txt,给出台站名和有效天数。
  • 输出文件总数,文件总大小。