Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 12 additions & 10 deletions dev/docker-compose-integration.yml
Original file line number Diff line number Diff line change
Expand Up @@ -17,21 +17,22 @@

services:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There is also a YAML key-sorting pre-commit hook, but I'm not sure whether it would improve readability: https://github.com/leehambley/pre-commit-sort-yaml-keys

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thank you! I can look into this.

spark-iceberg:
container_name: pyiceberg-spark
image: pyiceberg-spark:latest
build: spark/
container_name: pyiceberg-spark
networks:
iceberg_net:
depends_on:
- rest
- hive
- minio
ports:
- 15002:15002 # Spark Connect
- 4040:4040 # Spark UI
environment:
- AWS_ACCESS_KEY_ID=admin
- AWS_SECRET_ACCESS_KEY=password
- AWS_REGION=us-east-1
ports:
- 15002:15002 # Spark Connect
- 4040:4040 # Spark UI
links:
- rest:rest
- hive:hive
Expand Down Expand Up @@ -60,25 +61,25 @@ services:
minio:
image: minio/minio
container_name: pyiceberg-minio
environment:
- MINIO_ROOT_USER=admin
- MINIO_ROOT_PASSWORD=password
- MINIO_DOMAIN=minio
networks:
iceberg_net:
aliases:
- warehouse.minio
ports:
- 9001:9001
- 9000:9000
environment:
- MINIO_ROOT_USER=admin
- MINIO_ROOT_PASSWORD=password
- MINIO_DOMAIN=minio
command: ["server", "/data", "--console-address", ":9001"]
mc:
depends_on:
- minio
image: minio/mc
container_name: pyiceberg-mc
networks:
iceberg_net:
depends_on:
- minio
environment:
- AWS_ACCESS_KEY_ID=admin
- AWS_SECRET_ACCESS_KEY=password
Expand All @@ -91,6 +92,7 @@ services:
tail -f /dev/null
"
hive:
image: pyiceberg-hive:latest
build: hive/
container_name: pyiceberg-hive
hostname: hive
Expand Down
25 changes: 18 additions & 7 deletions dev/hive/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -15,17 +15,28 @@

FROM apache/hive:4.0.0

ENV HADOOP_VERSION=3.3.6
ENV AWS_SDK_BUNDLE=1.12.753
# Dependency versions - changing these invalidates the JAR download layer
ARG HADOOP_VERSION=3.3.6
ARG AWS_SDK_BUNDLE=1.12.753
ARG MAVEN_MIRROR=https://repo1.maven.org/maven2

USER root

# Install curl, download JARs, and cleanup in a single layer
RUN apt-get update -qq && apt-get -qq -y install curl && \
curl https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/${HADOOP_VERSION}/hadoop-aws-${HADOOP_VERSION}.jar -Lo /opt/hive/lib/hadoop-aws-${HADOOP_VERSION}.jar && \
curl https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-bundle/${AWS_SDK_BUNDLE}/aws-java-sdk-bundle-${AWS_SDK_BUNDLE}.jar -Lo /opt/hive/lib/aws-java-sdk-bundle-${AWS_SDK_BUNDLE}.jar && \
apt-get clean && rm -rf /var/lib/apt/lists/*
# Install curl (separate layer - rarely changes)
RUN apt-get update -qq && \
apt-get -qq -y install --no-install-recommends curl && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*

# Download JARs with retry logic (slow layer - only changes when versions change)
RUN curl -fsSL --retry 3 --retry-delay 5 \
-o /opt/hive/lib/hadoop-aws-${HADOOP_VERSION}.jar \
"${MAVEN_MIRROR}/org/apache/hadoop/hadoop-aws/${HADOOP_VERSION}/hadoop-aws-${HADOOP_VERSION}.jar" && \
curl -fsSL --retry 3 --retry-delay 5 \
-o /opt/hive/lib/aws-java-sdk-bundle-${AWS_SDK_BUNDLE}.jar \
"${MAVEN_MIRROR}/com/amazonaws/aws-java-sdk-bundle/${AWS_SDK_BUNDLE}/aws-java-sdk-bundle-${AWS_SDK_BUNDLE}.jar"

# Copy configuration last (changes more frequently than JARs)
COPY core-site.xml /opt/hadoop/etc/hadoop/core-site.xml

USER hive
46 changes: 21 additions & 25 deletions dev/spark/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -18,50 +18,46 @@ ARG BASE_IMAGE_SPARK_VERSION=4.0.1
FROM apache/spark:${BASE_IMAGE_SPARK_VERSION}

# Dependency versions - keep these compatible
# Changing these will invalidate the JAR download cache layer
ARG ICEBERG_VERSION=1.10.1
ARG ICEBERG_SPARK_RUNTIME_VERSION=4.0_2.13
ARG HADOOP_VERSION=3.4.1
ARG SCALA_VERSION=2.13
ARG AWS_SDK_VERSION=2.24.6
ARG MAVEN_MIRROR=https://repo.maven.apache.org/maven2

USER root
WORKDIR ${SPARK_HOME}

# Install curl for JAR downloads
RUN apt-get update && \
apt-get install -y --no-install-recommends curl && \
rm -rf /var/lib/apt/lists/*

# Copy configuration (early for better caching)
COPY --chown=spark:spark spark-defaults.conf ${SPARK_HOME}/conf/

# Create event log directory
RUN mkdir -p /home/iceberg/spark-events && \
# Install curl and create directories
RUN apt-get update -qq && \
apt-get install -qq -y --no-install-recommends curl && \
apt-get clean && \
rm -rf /var/lib/apt/lists/* && \
mkdir -p /home/iceberg/spark-events && \
chown -R spark:spark /home/iceberg

# Required JAR dependencies
ENV JARS_TO_DOWNLOAD="\
org/apache/iceberg/iceberg-spark-runtime-${ICEBERG_SPARK_RUNTIME_VERSION}/${ICEBERG_VERSION}/iceberg-spark-runtime-${ICEBERG_SPARK_RUNTIME_VERSION}-${ICEBERG_VERSION}.jar \
org/apache/iceberg/iceberg-aws-bundle/${ICEBERG_VERSION}/iceberg-aws-bundle-${ICEBERG_VERSION}.jar \
org/apache/hadoop/hadoop-aws/${HADOOP_VERSION}/hadoop-aws-${HADOOP_VERSION}.jar \
software/amazon/awssdk/bundle/${AWS_SDK_VERSION}/bundle-${AWS_SDK_VERSION}.jar"

# Download JARs with retry logic
# Download JARs with retry logic (most cacheable - only changes when versions change)
# This is the slowest step, so we do it before copying config files
RUN set -e && \
cd "${SPARK_HOME}/jars" && \
for jar_path in ${JARS_TO_DOWNLOAD}; do \
for jar_path in \
"org/apache/iceberg/iceberg-spark-runtime-${ICEBERG_SPARK_RUNTIME_VERSION}/${ICEBERG_VERSION}/iceberg-spark-runtime-${ICEBERG_SPARK_RUNTIME_VERSION}-${ICEBERG_VERSION}.jar" \
"org/apache/iceberg/iceberg-aws-bundle/${ICEBERG_VERSION}/iceberg-aws-bundle-${ICEBERG_VERSION}.jar" \
"org/apache/hadoop/hadoop-aws/${HADOOP_VERSION}/hadoop-aws-${HADOOP_VERSION}.jar" \
"software/amazon/awssdk/bundle/${AWS_SDK_VERSION}/bundle-${AWS_SDK_VERSION}.jar"; \
do \
jar_name=$(basename "${jar_path}") && \
echo "Downloading ${jar_name}..." && \
curl -fsSL --retry 3 --retry-delay 5 \
-o "${jar_name}" \
"${MAVEN_MIRROR}/${jar_path}" && \
echo "✓ Downloaded ${jar_name}"; \
done && \
chown -R spark:spark "${SPARK_HOME}/jars"
chown spark:spark "${jar_name}"; \
done

# Copy configuration last (changes more frequently than JARs)
COPY --chown=spark:spark spark-defaults.conf ${SPARK_HOME}/conf/

USER spark
WORKDIR ${SPARK_HOME}

# Start Spark Connect server
CMD ["sh", "-c", "SPARK_NO_DAEMONIZE=true ${SPARK_HOME}/sbin/start-connect-server.sh"]
CMD ["bash", "-c", "SPARK_NO_DAEMONIZE=true ${SPARK_HOME}/sbin/start-connect-server.sh"]