Table design
Using Python, create the table project_chunks_table in the Cassandra keyspace seismic_keyspace. The table has the columns project, sensor, trace, and chunk, where project is a text column, sensor and trace are integers, and chunk is a binary (BLOB) column whose contents are kept to at most 255 KB per row. Use project and sensor together as the partition key and trace as the clustering key.
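A minimal sketch of this step with the DataStax Python driver is below; it reuses the contact points that appear in the code later in this section, and the SimpleStrategy replication settings for the keyspace are an assumption. Note that Cassandra does not enforce a size limit on BLOB columns, so the 255 KB cap has to be applied by whatever code writes the chunks.

from cassandra.cluster import Cluster

# Connect to the cluster (same placeholder contact points as the rest of the section)
cluster = Cluster(['192.168.92.159', '192.168.92.160', '192.168.92.161'])
session = cluster.connect()

# Create the keyspace if needed; the replication settings here are an assumption
session.execute("""
    CREATE KEYSPACE IF NOT EXISTS seismic_keyspace
    WITH replication = {'class': 'SimpleStrategy', 'replication_factor': 3}
""")

# (project, sensor) is the partition key, trace is the clustering key
session.execute("""
    CREATE TABLE IF NOT EXISTS seismic_keyspace.project_chunks_table (
        project TEXT,
        sensor INT,
        trace INT,
        chunk BLOB,
        PRIMARY KEY ((project, sensor), trace)
    )
""")

cluster.shutdown()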
Writing data
Take a SEGY file of about 1.96 GB and split it into chunks of no more than 255 KB each. Store each chunk in the chunk column of project_chunks_table, with project set to 'proj_001', sensor set to 1, and trace incrementing from 1 with each chunk. A concrete implementation follows.
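As a rough sanity check on the volume of data involved: at 255 KB per chunk, a 1.96 GB file (about 1.96 × 1024 × 1024 ≈ 2,055,209 KB) splits into roughly 2,055,209 / 255 ≈ 8,060 chunks, so trace should end up running from 1 to about 8,060.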
import os
from cassandra.cluster import Cluster

# Connect to the Cassandra cluster (replace with the actual host addresses)
cassandra_host = ['192.168.92.159', '192.168.92.160', '192.168.92.161']
cluster = Cluster(cassandra_host)
session = cluster.connect()

# Keyspace and table to work with
keyspace_name = 'seismic_keyspace'
table_name = 'project_chunks_table'

# Switch to the target keyspace
session.set_keyspace(keyspace_name)

# Create the table if it does not exist
create_table_query = f"""
CREATE TABLE IF NOT EXISTS {table_name} (
    project TEXT,
    sensor INT,
    trace INT,
    chunk BLOB,
    PRIMARY KEY ((project, sensor), trace)
)
"""
session.execute(create_table_query)

# Split the SEGY file and insert the chunks into the table
segy_file_path = os.path.join('data', 'LX_SEGY005.segy')
chunk_size = 255 * 1024  # 255 KB in bytes
project = 'proj_001'
sensor = 1
trace = 1

insert_query = f"INSERT INTO {table_name} (project, sensor, trace, chunk) VALUES (%s, %s, %s, %s)"

with open(segy_file_path, 'rb') as segy_file:
    while True:
        chunk_data = segy_file.read(chunk_size)
        if not chunk_data:
            break
        # Insert one chunk per row; trace increases with each chunk
        session.execute(insert_query, (project, sensor, trace, chunk_data))
        trace += 1

# Close the connection
session.shutdown()
cluster.shutdown()
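Since the loop above issues several thousand INSERTs, one possible refinement (a sketch, not part of the original code) is to prepare the statement once so the server does not re-parse the CQL on every iteration:

from cassandra.cluster import Cluster

cluster = Cluster(['192.168.92.159', '192.168.92.160', '192.168.92.161'])
session = cluster.connect('seismic_keyspace')

# Prepare the INSERT once; bound values are supplied on each execution
insert_stmt = session.prepare(
    "INSERT INTO project_chunks_table (project, sensor, trace, chunk) VALUES (?, ?, ?, ?)"
)

chunk_size = 255 * 1024  # 255 KB per chunk
trace = 1
with open('data/LX_SEGY005.segy', 'rb') as segy_file:
    while True:
        chunk_data = segy_file.read(chunk_size)
        if not chunk_data:
            break
        session.execute(insert_stmt, ('proj_001', 1, trace, chunk_data))
        trace += 1

cluster.shutdown()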
Querying the tables in a keyspace
from cassandra.cluster import Cluster

def query_tables_in_keyspace(keyspace_name):
    # Connect to the Cassandra cluster
    cluster = Cluster(['192.168.92.159', '192.168.92.160', '192.168.92.161'])
    session = cluster.connect(keyspace_name)

    # List every table in the given keyspace from the system schema
    tables_query = "SELECT table_name FROM system_schema.tables WHERE keyspace_name = %s;"
    result = session.execute(tables_query, [keyspace_name])

    # Print the table names
    for row in result:
        print(row.table_name)

    # Close the connection
    cluster.shutdown()

# Call the function with the keyspace name
keyspace_name = 'seismic_keyspace'
query_tables_in_keyspace(keyspace_name)
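An alternative worth noting, sketched below, is to read the table names from the driver's local schema metadata instead of querying system_schema; it assumes the same contact points as above.

from cassandra.cluster import Cluster

cluster = Cluster(['192.168.92.159', '192.168.92.160', '192.168.92.161'])
cluster.connect()  # connecting populates cluster.metadata with the schema

# keyspaces is a dict of keyspace name -> metadata; tables is keyed by table name
keyspace_meta = cluster.metadata.keyspaces['seismic_keyspace']
for table_name in keyspace_meta.tables:
    print(table_name)

cluster.shutdown()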
Querying data
from cassandra.cluster import Cluster

# Connect to the Cassandra cluster (replace with the actual host addresses)
cassandra_host = ['192.168.92.159', '192.168.92.160', '192.168.92.161']
cluster = Cluster(cassandra_host)
session = cluster.connect()

# Keyspace and table to query
keyspace_name = 'seismic_keyspace'
table_name = 'project_chunks_table'

# Switch to the target keyspace
session.set_keyspace(keyspace_name)

# Query the table
query = f"SELECT project, sensor, trace, chunk FROM {table_name}"
result = session.execute(query)

# Print the results
for row in result:
    print(f"Project: {row.project}, Sensor: {row.sensor}, Trace: {row.trace}, Chunk Size: {len(row.chunk)} bytes")

# Close the connection
session.shutdown()
cluster.shutdown()
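The SELECT above scans every partition, which is fine for a quick check but not the usual Cassandra access pattern; a sketch of reading a single partition (using the proj_001 / sensor 1 values written earlier) looks like this:

from cassandra.cluster import Cluster

cluster = Cluster(['192.168.92.159', '192.168.92.160', '192.168.92.161'])
session = cluster.connect('seismic_keyspace')

# Restrict the read to one partition; rows come back ordered by the clustering key trace
query = ("SELECT trace, chunk FROM project_chunks_table "
         "WHERE project = %s AND sensor = %s")
for row in session.execute(query, ('proj_001', 1)):
    print(f"Trace: {row.trace}, Chunk Size: {len(row.chunk)} bytes")

cluster.shutdown()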
Merging the file
Using Python, work with the table project_chunks_table in the Cassandra keyspace seismic_keyspace (columns: project as text, sensor and trace as integers, chunk as a BLOB of at most 255 KB; partition key (project, sensor), clustering key trace). Read the binary data stored in the chunk column, concatenate the chunks in ascending trace order, and save the merged file as common.segy in the data directory.
- Merge the binary chunks back into a single SEGY file
from cassandra.cluster import Cluster
import os

# Connect to the Cassandra cluster (replace with the actual host addresses)
cassandra_host = ['192.168.92.159', '192.168.92.160', '192.168.92.161']
cluster = Cluster(cassandra_host)
session = cluster.connect()

# Keyspace and table to read from
keyspace_name = 'seismic_keyspace'
table_name = 'project_chunks_table'

# Switch to the target keyspace
session.set_keyspace(keyspace_name)

# Find every distinct partition key combination
unique_combinations_query = f"SELECT DISTINCT project, sensor FROM {table_name}"
unique_combinations_result = session.execute(unique_combinations_query)

# Buffer that accumulates the merged binary data
merged_data = b""

# For each partition, read the chunks in ascending trace order and append them
for combination_row in unique_combinations_result:
    project = combination_row.project
    sensor = combination_row.sensor
    query = (f"SELECT trace, chunk FROM {table_name} "
             "WHERE project = %s AND sensor = %s ORDER BY trace")
    result = session.execute(query, (project, sensor))
    for row in result:
        merged_data += row.chunk

# Close the connection
session.shutdown()
cluster.shutdown()

# Save the merged data as common.segy in the data directory
os.makedirs('data', exist_ok=True)
output_file_path = 'data/common.segy'
with open(output_file_path, 'wb') as output_file:
    output_file.write(merged_data)

print(f"Merged data saved to {output_file_path}")
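One thing to keep in mind about the code above is that merged_data holds the entire ~1.96 GB file in memory before anything is written; a streaming variant (a sketch under the same schema and the same assumed contact points) writes each chunk to disk as it arrives:

from cassandra.cluster import Cluster
import os

cluster = Cluster(['192.168.92.159', '192.168.92.160', '192.168.92.161'])
session = cluster.connect('seismic_keyspace')

os.makedirs('data', exist_ok=True)
with open('data/common.segy', 'wb') as output_file:
    # Walk each partition and append its chunks in trace order without buffering in memory
    for part in session.execute("SELECT DISTINCT project, sensor FROM project_chunks_table"):
        rows = session.execute(
            "SELECT chunk FROM project_chunks_table "
            "WHERE project = %s AND sensor = %s ORDER BY trace",
            (part.project, part.sensor),
        )
        for row in rows:
            output_file.write(row.chunk)

cluster.shutdown()
print("Merged data saved to data/common.segy")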